RAHUL-13's picture
Upload graders.py with huggingface_hub
36c2b7d verified
"""
Bug Report Structuring Environment - Grading Logic
Deterministic grading of structured bug reports against ground truth.
Returns scores in [0.0, 1.0] with partial credit for each field.
Scoring dimensions:
- title (weight: 0.15) - keyword coverage in title
- steps (weight: 0.25) - completeness of reproduction steps
- expected (weight: 0.15) - expected behavior accuracy
- actual (weight: 0.15) - actual behavior accuracy
- severity (weight: 0.15) - severity classification correctness
- environment (weight: 0.10) - environment info extraction
- format (weight: 0.05) - structural completeness
"""
from typing import Dict, Tuple
from tasks import SEVERITY_ADJACENCY, SEVERITY_LEVELS
# Weights for each scoring dimension
FIELD_WEIGHTS = {
"title": 0.15,
"steps_to_reproduce": 0.25,
"expected_behavior": 0.15,
"actual_behavior": 0.15,
"severity": 0.15,
"environment": 0.10,
"format": 0.05,
}
def _keyword_score(text: str, keywords: list) -> float:
"""
Score text based on what fraction of keywords are found.
Returns float in [0.0, 1.0].
"""
if not text or not keywords:
return 0.0
text_lower = text.lower()
matches = 0
for kw in keywords:
if isinstance(kw, str) and kw.lower() in text_lower:
matches += 1
return min(1.0, matches / max(len(keywords), 1))
def _severity_score(submitted: str, expected: str) -> float:
"""
Score severity classification.
Exact match = 1.0, adjacent = 0.5, wrong = 0.0.
"""
submitted_clean = submitted.strip().lower()
expected_clean = expected.strip().lower()
if submitted_clean not in SEVERITY_LEVELS:
return 0.0
return SEVERITY_ADJACENCY.get(expected_clean, {}).get(submitted_clean, 0.0)
def _format_score(action: dict) -> float:
"""
Score structural completeness of the submission.
Checks that all required fields are non-empty.
"""
required_fields = [
"title", "steps_to_reproduce", "expected_behavior",
"actual_behavior", "severity", "environment"
]
present = 0
for field in required_fields:
value = action.get(field, "")
if isinstance(value, str) and len(value.strip()) > 5:
present += 1
return present / len(required_fields)
def grade_submission(action: dict, task: dict) -> Tuple[float, Dict[str, float], str]:
"""
Grade a structured bug report submission against the task's ground truth.
Args:
action: dict with keys: title, steps_to_reproduce, expected_behavior,
actual_behavior, severity, environment, additional_notes
task: task definition dict from tasks.py
Returns:
Tuple of (overall_score, field_scores_dict, feedback_text)
"""
keywords = task["keywords"]
ground_truth = task["ground_truth"]
field_scores = {}
feedback_parts = []
# ── Title Score ────────────────────────────────────────────
title = action.get("title", "")
field_scores["title"] = _keyword_score(title, keywords["title"])
if field_scores["title"] < 0.5:
feedback_parts.append(
f"Title needs improvement. Include key details: "
f"the affected component and the nature of the problem."
)
elif field_scores["title"] < 1.0:
feedback_parts.append("Title captures the main issue but could be more specific.")
else:
feedback_parts.append("Title is well-written and descriptive.")
# ── Steps to Reproduce Score ──────────────────────────────
steps = action.get("steps_to_reproduce", "")
field_scores["steps_to_reproduce"] = _keyword_score(steps, keywords["steps_to_reproduce"])
if field_scores["steps_to_reproduce"] < 0.4:
feedback_parts.append(
"Steps to reproduce are incomplete. Include specific actions, "
"preconditions, and observable results at each step."
)
elif field_scores["steps_to_reproduce"] < 0.7:
feedback_parts.append(
"Steps cover the basics but are missing some important details "
"from the original report."
)
else:
feedback_parts.append("Steps to reproduce are thorough and well-structured.")
# ── Expected Behavior Score ───────────────────────────────
expected = action.get("expected_behavior", "")
field_scores["expected_behavior"] = _keyword_score(expected, keywords["expected_behavior"])
if field_scores["expected_behavior"] < 0.5:
feedback_parts.append(
"Expected behavior description is vague. Be specific about "
"what the correct behavior should be."
)
else:
feedback_parts.append("Expected behavior is clearly stated.")
# ── Actual Behavior Score ─────────────────────────────────
actual = action.get("actual_behavior", "")
field_scores["actual_behavior"] = _keyword_score(actual, keywords["actual_behavior"])
if field_scores["actual_behavior"] < 0.5:
feedback_parts.append(
"Actual behavior description is incomplete. Include the specific "
"symptoms, error messages, and observable effects."
)
else:
feedback_parts.append("Actual behavior is well-documented.")
# ── Severity Score ────────────────────────────────────────
severity = action.get("severity", "")
field_scores["severity"] = _severity_score(severity, keywords["severity"])
if field_scores["severity"] < 1.0:
expected_sev = keywords["severity"]
if field_scores["severity"] == 0.0:
feedback_parts.append(
f"Severity '{severity}' is incorrect. Consider the impact: "
f"does it cause data loss, block users, or is it cosmetic?"
)
else:
feedback_parts.append(
f"Severity '{severity}' is close but not ideal. "
f"Think about the real-world impact of this issue."
)
else:
feedback_parts.append("Severity assessment is accurate.")
# ── Environment Score ─────────────────────────────────────
env = action.get("environment", "")
field_scores["environment"] = _keyword_score(env, keywords["environment"])
if field_scores["environment"] < 0.5:
feedback_parts.append(
"Environment details are incomplete. Include OS, browser/runtime, "
"and version numbers mentioned in the report."
)
else:
feedback_parts.append("Environment information is well-captured.")
# ── Format Score ──────────────────────────────────────────
field_scores["format"] = _format_score(action)
if field_scores["format"] < 1.0:
feedback_parts.append(
"Some fields are missing or too short. "
"Ensure all required fields have meaningful content."
)
# ── Compute Overall Score ─────────────────────────────────
overall_score = sum(
FIELD_WEIGHTS[field] * field_scores[field]
for field in FIELD_WEIGHTS
)
overall_score = round(min(1.0, max(0.0, overall_score)), 4)
# Round field scores for display
field_scores = {k: round(v, 2) for k, v in field_scores.items()}
# Build feedback
feedback = f"Overall Score: {overall_score:.2f}/1.00\n\n"
feedback += "Field-by-field feedback:\n"
for part in feedback_parts:
feedback += f" β€’ {part}\n"
if overall_score >= 0.85:
feedback += "\nExcellent work! The structured report captures the key information well."
elif overall_score >= 0.6:
feedback += "\nGood effort. Some fields need refinement - review the feedback above."
else:
feedback += "\nThe report needs significant improvement. Focus on extracting all details from the original text."
return overall_score, field_scores, feedback