RAHUL-13 committed on
Commit
36c2b7d
·
verified ·
1 Parent(s): 3888b41

Upload graders.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. graders.py +210 -0
graders.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Bug Report Structuring Environment - Grading Logic
3
+
4
+ Deterministic grading of structured bug reports against ground truth.
5
+ Returns scores in [0.0, 1.0] with partial credit for each field.
6
+
7
+ Scoring dimensions:
8
+ - title (weight: 0.15) - keyword coverage in title
9
+ - steps (weight: 0.25) - completeness of reproduction steps
10
+ - expected (weight: 0.15) - expected behavior accuracy
11
+ - actual (weight: 0.15) - actual behavior accuracy
12
+ - severity (weight: 0.15) - severity classification correctness
13
+ - environment (weight: 0.10) - environment info extraction
14
+ - format (weight: 0.05) - structural completeness
15
+ """
16
+
17
+ from typing import Dict, Tuple
18
+ from tasks import SEVERITY_ADJACENCY, SEVERITY_LEVELS
19
+
20
# Weight of each scoring dimension in the overall score.
# The keys must match the keys of the per-field score dict built in
# grade_submission, which indexes both dicts with the same field name.
# The weights sum to 1.0, so a submission that is perfect on every
# dimension gets an overall score of 1.0.
FIELD_WEIGHTS: Dict[str, float] = {
    "title": 0.15,
    "steps_to_reproduce": 0.25,
    "expected_behavior": 0.15,
    "actual_behavior": 0.15,
    "severity": 0.15,
    "environment": 0.10,
    "format": 0.05,
}
30
+
31
+
32
+ def _keyword_score(text: str, keywords: list) -> float:
33
+ """
34
+ Score text based on what fraction of keywords are found.
35
+ Returns float in [0.0, 1.0].
36
+ """
37
+ if not text or not keywords:
38
+ return 0.0
39
+
40
+ text_lower = text.lower()
41
+ matches = 0
42
+ for kw in keywords:
43
+ if isinstance(kw, str) and kw.lower() in text_lower:
44
+ matches += 1
45
+
46
+ return min(1.0, matches / max(len(keywords), 1))
47
+
48
+
49
+ def _severity_score(submitted: str, expected: str) -> float:
50
+ """
51
+ Score severity classification.
52
+ Exact match = 1.0, adjacent = 0.5, wrong = 0.0.
53
+ """
54
+ submitted_clean = submitted.strip().lower()
55
+ expected_clean = expected.strip().lower()
56
+
57
+ if submitted_clean not in SEVERITY_LEVELS:
58
+ return 0.0
59
+
60
+ return SEVERITY_ADJACENCY.get(expected_clean, {}).get(submitted_clean, 0.0)
61
+
62
+
63
+ def _format_score(action: dict) -> float:
64
+ """
65
+ Score structural completeness of the submission.
66
+ Checks that all required fields are non-empty.
67
+ """
68
+ required_fields = [
69
+ "title", "steps_to_reproduce", "expected_behavior",
70
+ "actual_behavior", "severity", "environment"
71
+ ]
72
+ present = 0
73
+ for field in required_fields:
74
+ value = action.get(field, "")
75
+ if isinstance(value, str) and len(value.strip()) > 5:
76
+ present += 1
77
+
78
+ return present / len(required_fields)
79
+
80
+
81
def grade_submission(action: dict, task: dict) -> Tuple[float, Dict[str, float], str]:
    """
    Grade a structured bug report submission against a task's keywords.

    Each text field is scored by keyword coverage (_keyword_score), severity
    by _severity_score, and structural completeness by _format_score. The
    overall score is the FIELD_WEIGHTS-weighted sum of the unrounded field
    scores, clamped to [0.0, 1.0] and rounded to 4 decimals.

    Args:
        action: dict with keys: title, steps_to_reproduce, expected_behavior,
            actual_behavior, severity, environment, additional_notes.
            Missing keys are treated as empty. additional_notes is not read.
        task: task definition dict; task["keywords"] must provide entries for
            title, steps_to_reproduce, expected_behavior, actual_behavior,
            severity and environment.

    Returns:
        Tuple of (overall_score, field_scores_dict, feedback_text), where the
        field scores are rounded to 2 decimals for display.

    Raises:
        KeyError: if task has no "keywords" entry or it lacks one of the
            fields listed above.
    """
    keywords = task["keywords"]

    field_scores = {}
    feedback_parts = []

    # -- Title: keyword coverage, three feedback tiers --------------------
    title_score = _keyword_score(action.get("title", ""), keywords["title"])
    field_scores["title"] = title_score
    if title_score < 0.5:
        feedback_parts.append(
            "Title needs improvement. Include key details: "
            "the affected component and the nature of the problem."
        )
    elif title_score < 1.0:
        feedback_parts.append("Title captures the main issue but could be more specific.")
    else:
        feedback_parts.append("Title is well-written and descriptive.")

    # -- Steps to reproduce: keyword coverage, three feedback tiers -------
    steps_score = _keyword_score(
        action.get("steps_to_reproduce", ""), keywords["steps_to_reproduce"]
    )
    field_scores["steps_to_reproduce"] = steps_score
    if steps_score < 0.4:
        feedback_parts.append(
            "Steps to reproduce are incomplete. Include specific actions, "
            "preconditions, and observable results at each step."
        )
    elif steps_score < 0.7:
        feedback_parts.append(
            "Steps cover the basics but are missing some important details "
            "from the original report."
        )
    else:
        feedback_parts.append("Steps to reproduce are thorough and well-structured.")

    # -- Expected behavior ------------------------------------------------
    expected_score = _keyword_score(
        action.get("expected_behavior", ""), keywords["expected_behavior"]
    )
    field_scores["expected_behavior"] = expected_score
    if expected_score < 0.5:
        feedback_parts.append(
            "Expected behavior description is vague. Be specific about "
            "what the correct behavior should be."
        )
    else:
        feedback_parts.append("Expected behavior is clearly stated.")

    # -- Actual behavior --------------------------------------------------
    actual_score = _keyword_score(
        action.get("actual_behavior", ""), keywords["actual_behavior"]
    )
    field_scores["actual_behavior"] = actual_score
    if actual_score < 0.5:
        feedback_parts.append(
            "Actual behavior description is incomplete. Include the specific "
            "symptoms, error messages, and observable effects."
        )
    else:
        feedback_parts.append("Actual behavior is well-documented.")

    # -- Severity ---------------------------------------------------------
    severity = action.get("severity", "")
    # A null or non-string severity from the submitter is graded as an empty
    # label instead of being handed to string methods.
    severity_text = severity if isinstance(severity, str) else ""
    severity_score = _severity_score(severity_text, keywords["severity"])
    field_scores["severity"] = severity_score
    if severity_score == 0.0:
        feedback_parts.append(
            f"Severity '{severity}' is incorrect. Consider the impact: "
            "does it cause data loss, block users, or is it cosmetic?"
        )
    elif severity_score < 1.0:
        feedback_parts.append(
            f"Severity '{severity}' is close but not ideal. "
            "Think about the real-world impact of this issue."
        )
    else:
        feedback_parts.append("Severity assessment is accurate.")

    # -- Environment ------------------------------------------------------
    env_score = _keyword_score(action.get("environment", ""), keywords["environment"])
    field_scores["environment"] = env_score
    if env_score < 0.5:
        feedback_parts.append(
            "Environment details are incomplete. Include OS, browser/runtime, "
            "and version numbers mentioned in the report."
        )
    else:
        feedback_parts.append("Environment information is well-captured.")

    # -- Format: only produces feedback when something is missing ---------
    format_score = _format_score(action)
    field_scores["format"] = format_score
    if format_score < 1.0:
        feedback_parts.append(
            "Some fields are missing or too short. "
            "Ensure all required fields have meaningful content."
        )

    # -- Overall score: weighted sum of the unrounded field scores --------
    weighted_total = sum(
        weight * field_scores[field] for field, weight in FIELD_WEIGHTS.items()
    )
    overall_score = round(min(1.0, max(0.0, weighted_total)), 4)

    # Round field scores for display only, after the overall score is fixed.
    field_scores = {name: round(score, 2) for name, score in field_scores.items()}

    # -- Feedback text ----------------------------------------------------
    if overall_score >= 0.85:
        closing = "\nExcellent work! The structured report captures the key information well."
    elif overall_score >= 0.6:
        closing = "\nGood effort. Some fields need refinement - review the feedback above."
    else:
        closing = (
            "\nThe report needs significant improvement. "
            "Focus on extracting all details from the original text."
        )

    # Written as an escape so the bullet survives re-encoding of this file.
    bullet = "\u2022"
    bullet_lines = "".join(f" {bullet} {part}\n" for part in feedback_parts)
    feedback = (
        f"Overall Score: {overall_score:.2f}/1.00\n\n"
        "Field-by-field feedback:\n"
        f"{bullet_lines}"
        f"{closing}"
    )

    return overall_score, field_scores, feedback