Spaces:

dharunkkk
/

gov_env

Sleeping

App Files Files Community

gov_env / grader.py

dharunkkk

Upload grader.py

a08f890 verified about 2 months ago

raw

history blame contribute delete

17.5 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the BSD-style license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Deterministic grader for the Government Service Application Assistant Environment.
	"""

	from typing import Dict, Any, List, Tuple

	try:
	from .models import GovAction, GovObservation
	from .models import GovServiceState
	from .tasks import task_manager
	except ImportError:
	from models import GovAction, GovObservation
	from models import GovServiceState
	from tasks import task_manager


	class Grader:
	"""Deterministic grader for environment tasks."""

	def __init__(self):
	self.task_manager = task_manager

	def grade_action(
	self,
	action: GovAction,
	observation: GovObservation,
	state: GovServiceState,
	task_info: Dict[str, Any]
	) -> float:
	"""
	Grade an action based on the current task and state.

	Returns:
	Score between 0.0 and 1.0
	"""
	# Base score
	score = 0.0

	# Grade based on action type and task difficulty
	if task_info["expected_difficulty"] == "easy":
	score = self._grade_easy_task(action, observation, state, task_info)
	elif task_info["expected_difficulty"] == "medium":
	score = self._grade_medium_task(action, observation, state, task_info)
	elif task_info["expected_difficulty"] == "hard":
	score = self._grade_hard_task(action, observation, state, task_info)

	# Apply efficiency bonus/penalty
	optimal_steps = task_info.get("optimal_steps", 5)
	if state.step_count <= optimal_steps:
	score = min(1.0, score + 0.1) # Efficiency bonus
	elif state.step_count > optimal_steps * 2:
	score = max(0.0, score - 0.1) # Inefficiency penalty

	# Ensure score is in valid range
	return max(0.0, min(1.0, score))

	def _grade_easy_task(
	self,
	action: GovAction,
	observation: GovObservation,
	state: GovServiceState,
	task_info: Dict[str, Any]
	) -> float:
	"""Grade easy task: Identify required documents."""
	score = 0.0

	if action.action_type == "select_service":
	# Correctly selected service
	if action.service_type == task_info["service_type"]:
	score += 0.3
	else:
	score -= 0.2 # Penalty for wrong service

	elif action.action_type == "list_required_documents":
	# Check if we have required documents in observation
	required_docs = observation.required_documents
	expected_docs = self.task_manager.get_task_requirements(task_info["service_type"])

	if len(required_docs) == len(expected_docs):
	# Check if all expected documents are present
	expected_types = {doc.type for doc in expected_docs}
	observed_types = {doc.get("type", "") for doc in required_docs}

	if expected_types == observed_types:
	score += 0.6 # Perfect match
	else:
	# Partial credit for correct documents
	correct_count = len(expected_types & observed_types)
	score += 0.3 * (correct_count / len(expected_types)) if expected_types else 0
	elif len(required_docs) > 0:
	# Partial credit for having some documents
	expected_types = {doc.type for doc in expected_docs}
	observed_types = {doc.get("type", "") for doc in required_docs}
	correct_count = len(expected_types & observed_types)
	score += 0.3 * (correct_count / len(expected_types)) if expected_types else 0

	return max(0.0, min(1.0, score))

	def _grade_medium_task(
	self,
	action: GovAction,
	observation: GovObservation,
	state: GovServiceState,
	task_info: Dict[str, Any]
	) -> float:
	"""Grade medium task: Validate application."""
	score = 0.0

	if action.action_type == "select_service":
	# Correctly selected service
	if action.service_type == task_info["service_type"]:
	score += 0.2
	else:
	score -= 0.1

	elif action.action_type == "list_required_documents":
	# Check if we have required documents in observation
	required_docs = observation.required_documents
	expected_docs = self.task_manager.get_task_requirements(task_info["service_type"])

	if len(required_docs) == len(expected_docs):
	expected_types = {doc.type for doc in expected_docs}
	observed_types = {doc.get("type", "") for doc in required_docs}

	if expected_types == observed_types:
	score += 0.3
	else:
	correct_count = len(expected_types & observed_types)
	score += 0.15 * (correct_count / len(expected_types)) if expected_types else 0
	elif len(required_docs) > 0:
	expected_types = {doc.type for doc in expected_docs}
	observed_types = {doc.get("type", "") for doc in required_docs}
	correct_count = len(expected_types & observed_types)
	score += 0.15 * (correct_count / len(expected_types)) if expected_types else 0

	elif action.action_type == "validate_documents":
	# Check validation results
	validation_results = observation.validation_results
	if validation_results:
	expected_feedback = task_info.get("validation_feedback", {})

	# Check missing documents
	expected_missing = set(expected_feedback.get("missing_documents", []))
	actual_missing = set(validation_results.get("missing_documents", []))

	missing_score = 0.0
	if expected_missing:
	if actual_missing == expected_missing:
	missing_score = 0.3
	else:
	# Partial credit
	correct_missing = len(expected_missing & actual_missing)
	missing_score = 0.15 * (correct_missing / len(expected_missing))
	else:
	missing_score = 0.3 # No missing expected is good

	# Check invalid documents
	expected_invalid = expected_feedback.get("invalid_documents", [])
	actual_invalid = validation_results.get("invalid_documents", [])

	invalid_score = 0.0
	if expected_invalid:
	# Simple check: same count and types
	if len(expected_invalid) == len(actual_invalid):
	expected_types = {item.get("type", "") for item in expected_invalid}
	actual_types = {item.get("type", "") for item in actual_invalid}
	if expected_types == actual_types:
	invalid_score = 0.3
	else:
	correct_types = len(expected_types & actual_types)
	invalid_score = 0.15 * (correct_types / len(expected_types)) if expected_types else 0
	else:
	# Partial credit based on overlap
	expected_set = {(item.get("type", ""), item.get("reason", "")) for item in expected_invalid}
	actual_set = {(item.get("type", ""), item.get("reason", "")) for item in actual_invalid}
	correct_items = len(expected_set & actual_set)
	invalid_score = 0.15 * (correct_items / len(expected_invalid)) if expected_invalid else 0
	else:
	invalid_score = 0.3 # No invalid expected is good

	# Check valid documents
	expected_valid = set(expected_feedback.get("valid_documents", []))
	actual_valid = set(validation_results.get("valid_documents", []))

	valid_score = 0.0
	if expected_valid:
	if actual_valid == expected_valid:
	valid_score = 0.2
	else:
	correct_valid = len(expected_valid & actual_valid)
	valid_score = 0.1 * (correct_valid / len(expected_valid))
	else:
	valid_score = 0.2 # No valid expected is good

	score += missing_score + invalid_score + valid_score
	else:
	# No validation results yet - small penalty for premature validation
	score -= 0.1

	return max(0.0, min(1.0, score))

	def _grade_hard_task(
	self,
	action: GovAction,
	observation: GovObservation,
	state: GovServiceState,
	task_info: Dict[str, Any]
	) -> float:
	"""Grade hard task: Fix incorrect application."""
	score = 0.0

	if action.action_type == "select_service":
	# Correctly selected service
	if action.service_type == task_info["service_type"]:
	score += 0.15
	else:
	score -= 0.1

	elif action.action_type == "list_required_documents":
	# Check if we have required documents in observation
	required_docs = observation.required_documents
	expected_docs = self.task_manager.get_task_requirements(task_info["service_type"])

	if len(required_docs) == len(expected_docs):
	expected_types = {doc.type for doc in expected_docs}
	observed_types = {doc.get("type", "") for doc in required_docs}

	if expected_types == observed_types:
	score += 0.25
	else:
	correct_count = len(expected_types & observed_types)
	score += 0.125 * (correct_count / len(expected_types)) if expected_types else 0
	elif len(required_docs) > 0:
	expected_types = {doc.type for doc in expected_docs}
	observed_types = {doc.get("type", "") for doc in required_docs}
	correct_count = len(expected_types & observed_types)
	score += 0.125 * (correct_count / len(expected_types)) if expected_types else 0

	elif action.action_type == "validate_documents":
	# Check validation results
	validation_results = observation.validation_results
	if validation_results:
	expected_feedback = task_info.get("validation_feedback", {})

	# Similar to medium task but with different weighting
	expected_missing = set(expected_feedback.get("missing_documents", []))
	actual_missing = set(validation_results.get("missing_documents", []))

	missing_score = 0.0
	if expected_missing:
	if actual_missing == expected_missing:
	missing_score = 0.25
	else:
	correct_missing = len(expected_missing & actual_missing)
	missing_score = 0.125 * (correct_missing / len(expected_missing))
	else:
	missing_score = 0.25

	# Check invalid documents
	expected_invalid = expected_feedback.get("invalid_documents", [])
	actual_invalid = validation_results.get("invalid_documents", [])

	invalid_score = 0.0
	if expected_invalid:
	if len(expected_invalid) == len(actual_invalid):
	expected_types = {item.get("type", "") for item in expected_invalid}
	actual_types = {item.get("type", "") for item in actual_invalid}
	if expected_types == actual_types:
	invalid_score = 0.25
	else:
	correct_types = len(expected_types & actual_types)
	invalid_score = 0.125 * (correct_types / len(expected_types)) if expected_types else 0
	else:
	expected_set = {(item.get("type", ""), item.get("reason", "")) for item in expected_invalid}
	actual_set = {(item.get("type", ""), item.get("reason", "")) for item in actual_invalid}
	correct_items = len(expected_set & actual_set)
	invalid_score = 0.125 * (correct_items / len(expected_invalid)) if expected_invalid else 0
	else:
	invalid_score = 0.25

	# Check valid documents
	expected_valid = set(expected_feedback.get("valid_documents", []))
	actual_valid = set(validation_results.get("valid_documents", []))

	valid_score = 0.0
	if expected_valid:
	if actual_valid == expected_valid:
	valid_score = 0.15
	else:
	correct_valid = len(expected_valid & actual_valid)
	valid_score = 0.075 * (correct_valid / len(expected_valid))
	else:
	valid_score = 0.15

	score += missing_score + invalid_score + valid_score
	else:
	score -= 0.05

	elif action.action_type == "suggest_corrections":
	# Check correction suggestions
	correction_suggestions = observation.correction_suggestions
	expected_feedback = task_info.get("validation_feedback", {})

	if correction_suggestions:
	# Check if we addressed the invalid documents
	expected_invalid = expected_feedback.get("invalid_documents", [])
	expected_missing = expected_feedback.get("missing_documents", [])

	# Simple heuristic: good suggestions address the issues
	issue_count = len(expected_invalid) + len(expected_missing)
	if issue_count > 0:
	# Check if suggestions mention addressing these issues
	suggestion_text = " ".join([
	str(sugg.get("suggested_action", "")) + " " +
	str(sugg.get("issue_type", ""))
	for sugg in correction_suggestions
	]).lower()

	# Check for key terms from expected issues
	issue_terms = []
	for item in expected_invalid:
	issue_terms.append(item.get("type", "").lower())
	issue_terms.append(item.get("reason", "").lower())
	for item in expected_missing:
	issue_terms.append(item.lower())

	# Simple matching score
	matches = 0
	for term in issue_terms:
	if term and term in suggestion_text:
	matches += 1

	if issue_terms:
	suggestion_score = 0.2 * (matches / len(issue_terms))
	else:
	suggestion_score = 0.2

	score += suggestion_score
	else:
	score += 0.2 # No issues to correct
	else:
	# No suggestions when there are issues to correct
	if expected_feedback.get("invalid_documents") or expected_feedback.get("missing_documents"):
	score -= 0.1
	else:
	score += 0.1 # Correctly identified no issues

	elif action.action_type == "submit_application":
	# Check if application is ready to submit
	is_complete = observation.is_complete
	validation_results = observation.validation_results

	if validation_results and validation_results.get("is_valid") and validation_results.get("is_complete"):
	if is_complete:
	score += 0.3 # Correctly submitted valid application
	else:
	score -= 0.1 # Submitted but observation says incomplete
	elif not validation_results or not validation_results.get("is_valid"):
	if not is_complete:
	score += 0.1 # Correctly didn't submit invalid application
	else:
	score -= 0.2 # Incorrectly thinks incomplete application is complete
	else:
	score -= 0.1 # Unclear state

	return max(0.0, min(1.0, score))


	# Global grader instance
	grader = Grader()