Spaces:

RAHUL-13
/

bug-report-structuring-env

Sleeping

App Files Files Community

bug-report-structuring-env / graders.py

RAHUL-13

Upload graders.py with huggingface_hub

36c2b7d verified 2 months ago

raw

history blame contribute delete

8.49 kB

	"""
	Bug Report Structuring Environment - Grading Logic

	Deterministic grading of structured bug reports against ground truth.
	Returns scores in [0.0, 1.0] with partial credit for each field.

	Scoring dimensions:
	- title (weight: 0.15) - keyword coverage in title
	- steps (weight: 0.25) - completeness of reproduction steps
	- expected (weight: 0.15) - expected behavior accuracy
	- actual (weight: 0.15) - actual behavior accuracy
	- severity (weight: 0.15) - severity classification correctness
	- environment (weight: 0.10) - environment info extraction
	- format (weight: 0.05) - structural completeness
	"""

	from typing import Dict, Tuple
	from tasks import SEVERITY_ADJACENCY, SEVERITY_LEVELS

	# Relative weight of every scoring dimension in the overall score.
	# The seven weights add up to 1.0; insertion order is the order in which
	# grade_submission accumulates the weighted sum.
	FIELD_WEIGHTS = dict(
	    title=0.15,
	    steps_to_reproduce=0.25,
	    expected_behavior=0.15,
	    actual_behavior=0.15,
	    severity=0.15,
	    environment=0.10,
	    format=0.05,
	)


	def _keyword_score(text: str, keywords: list) -> float:
	    """
	    Score text by the fraction of keywords it contains.

	    Matching is a case-insensitive substring test. Entries of ``keywords``
	    that are not strings never match but still count toward the total.

	    Args:
	        text: Submitted field text. A non-string value (for example ``None``
	            or a list) is scored 0.0 rather than raising.
	        keywords: Keywords expected to appear in ``text``.

	    Returns:
	        Float in [0.0, 1.0]; 0.0 when ``text`` or ``keywords`` is empty.
	    """
	    # Non-string submissions are treated as missing text instead of
	    # crashing on ``.lower()``.
	    if not isinstance(text, str) or not text or not keywords:
	        return 0.0

	    haystack = text.lower()
	    matches = sum(
	        1 for kw in keywords if isinstance(kw, str) and kw.lower() in haystack
	    )
	    # ``keywords`` is non-empty here and ``matches`` cannot exceed its
	    # length, so the ratio is already within [0.0, 1.0].
	    return matches / len(keywords)


	def _severity_score(submitted: str, expected: str) -> float:
	    """
	    Score a severity classification against the expected severity.

	    Both values are stripped and lower-cased before comparison. The score
	    is looked up as SEVERITY_ADJACENCY[expected][submitted]; any pair not
	    present in that table scores 0.0. (The table lives in tasks.py;
	    presumably exact match = 1.0 and adjacent = 0.5 - confirm there.)

	    Args:
	        submitted: Severity label from the submission.
	        expected: Severity label the task expects.

	    Returns:
	        The table score, or 0.0 when either argument is not a string or
	        ``submitted`` is not one of SEVERITY_LEVELS.
	    """
	    # A missing or malformed severity is simply wrong; do not crash on
	    # ``.strip()`` for None or other non-string values.
	    if not isinstance(submitted, str) or not isinstance(expected, str):
	        return 0.0

	    submitted_clean = submitted.strip().lower()
	    if submitted_clean not in SEVERITY_LEVELS:
	        return 0.0

	    expected_clean = expected.strip().lower()
	    return SEVERITY_ADJACENCY.get(expected_clean, {}).get(submitted_clean, 0.0)


	def _format_score(action: dict) -> float:
	    """
	    Score structural completeness of the submission.

	    Free-text fields (title, steps_to_reproduce, expected_behavior,
	    actual_behavior, environment) count as present when they are strings
	    with more than five characters after stripping whitespace.

	    ``severity`` is a short categorical label, so it only has to be a
	    non-blank string. Applying the five-character threshold to it would
	    report short labels (e.g. "low" or "high", assuming those are valid
	    severity levels) as missing.

	    Args:
	        action: Submission dict; missing keys and non-string values count
	            as absent.

	    Returns:
	        Fraction of the six required fields that are present, in [0.0, 1.0].
	    """
	    min_text_len = 5
	    text_fields = (
	        "title",
	        "steps_to_reproduce",
	        "expected_behavior",
	        "actual_behavior",
	        "environment",
	    )

	    present = 0
	    for field in text_fields:
	        value = action.get(field, "")
	        if isinstance(value, str) and len(value.strip()) > min_text_len:
	            present += 1

	    severity = action.get("severity", "")
	    if isinstance(severity, str) and severity.strip():
	        present += 1

	    return present / (len(text_fields) + 1)


	def grade_submission(action: dict, task: dict) -> Tuple[float, Dict[str, float], str]:
	    """
	    Grade a structured bug report submission against a task's expected keywords.

	    Text fields are scored with _keyword_score against the matching entry of
	    task["keywords"], severity with _severity_score, and structural
	    completeness with _format_score. The overall score is the
	    FIELD_WEIGHTS-weighted sum of the field scores, clamped to [0.0, 1.0]
	    and rounded to 4 decimals.

	    Args:
	        action: dict with keys: title, steps_to_reproduce, expected_behavior,
	            actual_behavior, severity, environment. Missing or non-string
	            values are graded as empty text instead of raising. Other keys
	            (e.g. additional_notes) are not scored.
	        task: task definition dict from tasks.py; task["keywords"] must hold
	            an entry for each of the six fields above.

	    Returns:
	        Tuple of (overall_score, field_scores_dict, feedback_text). The
	        per-field scores are rounded to 2 decimals for display.

	    Raises:
	        KeyError: if task has no "keywords" entry or a per-field keyword
	            entry is missing.
	    """
	    keywords = task["keywords"]

	    def _text(name: str) -> str:
	        # Non-string values (None, lists, numbers) are graded as empty text.
	        value = action.get(name, "")
	        return value if isinstance(value, str) else ""

	    field_scores = {}
	    feedback_parts = []

	    # ── Title Score ────────────────────────────────────────────
	    field_scores["title"] = _keyword_score(_text("title"), keywords["title"])
	    if field_scores["title"] < 0.5:
	        feedback_parts.append(
	            "Title needs improvement. Include key details: "
	            "the affected component and the nature of the problem."
	        )
	    elif field_scores["title"] < 1.0:
	        feedback_parts.append("Title captures the main issue but could be more specific.")
	    else:
	        feedback_parts.append("Title is well-written and descriptive.")

	    # ── Steps to Reproduce Score ──────────────────────────────
	    field_scores["steps_to_reproduce"] = _keyword_score(
	        _text("steps_to_reproduce"), keywords["steps_to_reproduce"]
	    )
	    if field_scores["steps_to_reproduce"] < 0.4:
	        feedback_parts.append(
	            "Steps to reproduce are incomplete. Include specific actions, "
	            "preconditions, and observable results at each step."
	        )
	    elif field_scores["steps_to_reproduce"] < 0.7:
	        feedback_parts.append(
	            "Steps cover the basics but are missing some important details "
	            "from the original report."
	        )
	    else:
	        feedback_parts.append("Steps to reproduce are thorough and well-structured.")

	    # ── Expected Behavior Score ───────────────────────────────
	    field_scores["expected_behavior"] = _keyword_score(
	        _text("expected_behavior"), keywords["expected_behavior"]
	    )
	    if field_scores["expected_behavior"] < 0.5:
	        feedback_parts.append(
	            "Expected behavior description is vague. Be specific about "
	            "what the correct behavior should be."
	        )
	    else:
	        feedback_parts.append("Expected behavior is clearly stated.")

	    # ── Actual Behavior Score ─────────────────────────────────
	    field_scores["actual_behavior"] = _keyword_score(
	        _text("actual_behavior"), keywords["actual_behavior"]
	    )
	    if field_scores["actual_behavior"] < 0.5:
	        feedback_parts.append(
	            "Actual behavior description is incomplete. Include the specific "
	            "symptoms, error messages, and observable effects."
	        )
	    else:
	        feedback_parts.append("Actual behavior is well-documented.")

	    # ── Severity Score ────────────────────────────────────────
	    severity = _text("severity")
	    field_scores["severity"] = _severity_score(severity, keywords["severity"])
	    if field_scores["severity"] == 0.0:
	        feedback_parts.append(
	            f"Severity '{severity}' is incorrect. Consider the impact: "
	            "does it cause data loss, block users, or is it cosmetic?"
	        )
	    elif field_scores["severity"] < 1.0:
	        feedback_parts.append(
	            f"Severity '{severity}' is close but not ideal. "
	            "Think about the real-world impact of this issue."
	        )
	    else:
	        feedback_parts.append("Severity assessment is accurate.")

	    # ── Environment Score ─────────────────────────────────────
	    field_scores["environment"] = _keyword_score(
	        _text("environment"), keywords["environment"]
	    )
	    if field_scores["environment"] < 0.5:
	        feedback_parts.append(
	            "Environment details are incomplete. Include OS, browser/runtime, "
	            "and version numbers mentioned in the report."
	        )
	    else:
	        feedback_parts.append("Environment information is well-captured.")

	    # ── Format Score ──────────────────────────────────────────
	    # _format_score does its own type checks, so it gets the raw action.
	    field_scores["format"] = _format_score(action)
	    if field_scores["format"] < 1.0:
	        feedback_parts.append(
	            "Some fields are missing or too short. "
	            "Ensure all required fields have meaningful content."
	        )

	    # ── Compute Overall Score ─────────────────────────────────
	    overall_score = sum(
	        FIELD_WEIGHTS[field] * field_scores[field]
	        for field in FIELD_WEIGHTS
	    )
	    overall_score = round(min(1.0, max(0.0, overall_score)), 4)

	    # Round field scores for display (after the unrounded values were
	    # used for the weighted sum above).
	    field_scores = {k: round(v, 2) for k, v in field_scores.items()}

	    # Pick the closing remark from the overall score band.
	    if overall_score >= 0.85:
	        closing = "\nExcellent work! The structured report captures the key information well."
	    elif overall_score >= 0.6:
	        closing = "\nGood effort. Some fields need refinement - review the feedback above."
	    else:
	        closing = "\nThe report needs significant improvement. Focus on extracting all details from the original text."

	    # Build feedback
	    pieces = [
	        f"Overall Score: {overall_score:.2f}/1.00\n\n",
	        "Field-by-field feedback:\n",
	    ]
	    pieces.extend(f" • {part}\n" for part in feedback_parts)
	    pieces.append(closing)
	    feedback = "".join(pieces)

	    return overall_score, field_scores, feedback