Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1280,60 +1280,60 @@ Provide your grading following the mandatory output format.
|
|
| 1280 |
# 4. NEW: SUPERVISOR AGGREGATOR
|
| 1281 |
# Paper insight: Merge ensemble outputs into final decision
|
| 1282 |
# ---------------------------------------------------------
|
| 1283 |
-
class SupervisorAggregator:
|
| 1284 |
-
|
| 1285 |
-
|
| 1286 |
-
|
| 1287 |
-
|
| 1288 |
|
| 1289 |
-
|
| 1290 |
-
|
| 1291 |
-
|
| 1292 |
|
| 1293 |
-
|
| 1294 |
-
|
| 1295 |
-
|
| 1296 |
-
|
| 1297 |
-
|
| 1298 |
-
|
| 1299 |
-
|
| 1300 |
|
| 1301 |
-
|
| 1302 |
-
|
| 1303 |
|
| 1304 |
-
|
| 1305 |
-
|
| 1306 |
|
| 1307 |
-
|
| 1308 |
-
|
| 1309 |
-
|
| 1310 |
|
| 1311 |
-
|
| 1312 |
-
|
| 1313 |
|
| 1314 |
-
|
| 1315 |
-
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
|
| 1319 |
-
|
| 1320 |
-
|
| 1321 |
-
|
| 1322 |
|
| 1323 |
-
|
| 1324 |
-
|
| 1325 |
|
| 1326 |
-
|
| 1327 |
|
| 1328 |
-
|
| 1329 |
-
|
| 1330 |
|
| 1331 |
-
|
| 1332 |
-
|
| 1333 |
-
|
| 1334 |
-
|
| 1335 |
|
| 1336 |
-
|
| 1337 |
|
| 1338 |
|
| 1339 |
# ---------------------------------------------------------
|
|
@@ -1390,7 +1390,7 @@ class EnhancedVectorSystem:
|
|
| 1390 |
self.llm = LLMEvaluator()
|
| 1391 |
self.reranker = OnnxReranker()
|
| 1392 |
self.presence_checker = AnswerPresenceChecker()
|
| 1393 |
-
self.supervisor = SupervisorAggregator()
|
| 1394 |
self.all_chunks = []
|
| 1395 |
self.total_chunks = 0
|
| 1396 |
self.reference_summary = None # Store reference answer summary
|
|
@@ -1449,7 +1449,8 @@ class EnhancedVectorSystem:
|
|
| 1449 |
self.reference_summary = reference_text.strip()
|
| 1450 |
return f"✅ Reference answer set ({len(self.reference_summary)} chars). Will be used to calibrate grading."
|
| 1451 |
|
| 1452 |
-
def process_query(self, question, student_answer, max_marks, enable_ensemble=True):
|
|
|
|
| 1453 |
"""
|
| 1454 |
Enhanced grading pipeline with multi-stage processing.
|
| 1455 |
"""
|
|
@@ -1473,51 +1474,69 @@ class EnhancedVectorSystem:
|
|
| 1473 |
evidence_display += f"> {expanded_context[:500]}..."
|
| 1474 |
|
| 1475 |
# Stage 3: Ensemble Grading (Paper's key innovation)
|
| 1476 |
-
if not student_answer:
|
| 1477 |
-
|
| 1478 |
-
|
| 1479 |
-
if enable_ensemble:
|
| 1480 |
-
|
| 1481 |
-
|
| 1482 |
-
|
| 1483 |
-
|
| 1484 |
-
|
| 1485 |
-
|
| 1486 |
-
|
| 1487 |
-
|
| 1488 |
-
|
| 1489 |
-
|
| 1490 |
-
|
| 1491 |
-
|
| 1492 |
|
| 1493 |
-
|
| 1494 |
-
|
| 1495 |
|
| 1496 |
-
|
| 1497 |
-
|
| 1498 |
|
| 1499 |
-
|
| 1500 |
-
|
| 1501 |
|
| 1502 |
-
|
| 1503 |
|
| 1504 |
-
|
| 1505 |
-
|
| 1506 |
-
|
| 1507 |
-
|
| 1508 |
-
|
| 1509 |
|
| 1510 |
-
else:
|
| 1511 |
-
|
| 1512 |
-
|
| 1513 |
-
|
| 1514 |
-
|
| 1515 |
-
|
| 1516 |
-
|
| 1517 |
-
|
| 1518 |
-
|
| 1519 |
-
|
| 1520 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1521 |
|
| 1522 |
return evidence_display, llm_feedback
|
| 1523 |
|
|
@@ -1587,9 +1606,15 @@ with gr.Blocks(title="EduGenius AI Grader - Enhanced", theme=gr.themes.Soft()) a
|
|
| 1587 |
outputs=[ref_status]
|
| 1588 |
)
|
| 1589 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1590 |
run_btn.click(
|
| 1591 |
system.process_query,
|
| 1592 |
-
inputs=[q_input, a_input, max_marks, ensemble_check],
|
| 1593 |
outputs=[evidence_box, grade_box]
|
| 1594 |
)
|
| 1595 |
|
|
|
|
| 1280 |
# 4. NEW: SUPERVISOR AGGREGATOR
|
| 1281 |
# Paper insight: Merge ensemble outputs into final decision
|
| 1282 |
# ---------------------------------------------------------
|
| 1283 |
+
# class SupervisorAggregator:
|
| 1284 |
+
# """
|
| 1285 |
+
# Aggregates multiple grader outputs into a final consensus grade.
|
| 1286 |
+
# Paper uses another LLM call; we use statistical aggregation for CPU efficiency.
|
| 1287 |
+
# """
|
| 1288 |
|
| 1289 |
+
# def aggregate(self, grader_results: List[Dict], max_marks: int) -> Dict:
|
| 1290 |
+
# """
|
| 1291 |
+
# Aggregate K=3 grader results into final score.
|
| 1292 |
|
| 1293 |
+
# Returns:
|
| 1294 |
+
# - final_score: int (median of ensemble)
|
| 1295 |
+
# - disagreement: int (max - min score)
|
| 1296 |
+
# - needs_review: bool (high disagreement flag)
|
| 1297 |
+
# - consensus_analysis: str
|
| 1298 |
+
# """
|
| 1299 |
+
# scores = [r['score'] for r in grader_results]
|
| 1300 |
|
| 1301 |
+
# # Use median for robustness (paper uses supervisor LLM call)
|
| 1302 |
+
# final_score = int(np.median(scores))
|
| 1303 |
|
| 1304 |
+
# # Calculate disagreement
|
| 1305 |
+
# disagreement = max(scores) - min(scores)
|
| 1306 |
|
| 1307 |
+
# # Flag for manual review if disagreement too high
|
| 1308 |
+
# # Paper uses Dmax thresholds; we use 40% of max marks
|
| 1309 |
+
# needs_review = disagreement >= (0.4 * max_marks)
|
| 1310 |
|
| 1311 |
+
# # Merge analyses
|
| 1312 |
+
# consensus_analysis = self._merge_analyses(grader_results, final_score, disagreement)
|
| 1313 |
|
| 1314 |
+
# return {
|
| 1315 |
+
# "final_score": final_score,
|
| 1316 |
+
# "individual_scores": scores,
|
| 1317 |
+
# "disagreement": disagreement,
|
| 1318 |
+
# "needs_review": needs_review,
|
| 1319 |
+
# "consensus_analysis": consensus_analysis,
|
| 1320 |
+
# "grader_details": grader_results
|
| 1321 |
+
# }
|
| 1322 |
|
| 1323 |
+
# def _merge_analyses(self, results: List[Dict], final_score: int, disagreement: int) -> str:
|
| 1324 |
+
# """Create consensus analysis from multiple graders."""
|
| 1325 |
|
| 1326 |
+
# output = f"**Ensemble Grading Results** (Final: {final_score}, Disagreement: ±{disagreement})\n\n"
|
| 1327 |
|
| 1328 |
+
# for i, result in enumerate(results, 1):
|
| 1329 |
+
# output += f"**Grader {i} ({result['score']} points):**\n{result['analysis']}\n\n"
|
| 1330 |
|
| 1331 |
+
# if disagreement > 0:
|
| 1332 |
+
# output += f"\n⚠️ **Note:** Graders disagreed by {disagreement} points. "
|
| 1333 |
+
# if disagreement >= 5:
|
| 1334 |
+
# output += "Consider manual review."
|
| 1335 |
|
| 1336 |
+
# return output
|
| 1337 |
|
| 1338 |
|
| 1339 |
# ---------------------------------------------------------
|
|
|
|
| 1390 |
self.llm = LLMEvaluator()
|
| 1391 |
self.reranker = OnnxReranker()
|
| 1392 |
self.presence_checker = AnswerPresenceChecker()
|
| 1393 |
+
# self.supervisor = SupervisorAggregator()
|
| 1394 |
self.all_chunks = []
|
| 1395 |
self.total_chunks = 0
|
| 1396 |
self.reference_summary = None # Store reference answer summary
|
|
|
|
| 1449 |
self.reference_summary = reference_text.strip()
|
| 1450 |
return f"✅ Reference answer set ({len(self.reference_summary)} chars). Will be used to calibrate grading."
|
| 1451 |
|
| 1452 |
+
# def process_query(self, question, student_answer, max_marks, enable_ensemble=True):
|
| 1453 |
+
def process_query(self, question, student_answer, max_marks):
|
| 1454 |
"""
|
| 1455 |
Enhanced grading pipeline with multi-stage processing.
|
| 1456 |
"""
|
|
|
|
| 1474 |
evidence_display += f"> {expanded_context[:500]}..."
|
| 1475 |
|
| 1476 |
# Stage 3: Ensemble Grading (Paper's key innovation)
|
| 1477 |
+
# if not student_answer:
|
| 1478 |
+
# return evidence_display, "Please enter a student answer to grade."
|
| 1479 |
+
|
| 1480 |
+
# if enable_ensemble:
|
| 1481 |
+
# # Run K=3 independent graders
|
| 1482 |
+
# grader_results = []
|
| 1483 |
+
# for grader_id in range(1, 4): # K=3 ensemble
|
| 1484 |
+
# result = self.llm.evaluate_single(
|
| 1485 |
+
# context=expanded_context,
|
| 1486 |
+
# question=question,
|
| 1487 |
+
# student_answer=student_answer,
|
| 1488 |
+
# max_marks=max_marks,
|
| 1489 |
+
# grader_id=grader_id,
|
| 1490 |
+
# reference_summary=self.reference_summary
|
| 1491 |
+
# )
|
| 1492 |
+
# grader_results.append(result)
|
| 1493 |
|
| 1494 |
+
# # Stage 4: Supervisor Aggregation
|
| 1495 |
+
# final_result = self.supervisor.aggregate(grader_results, max_marks)
|
| 1496 |
|
| 1497 |
+
# # Format output
|
| 1498 |
+
# llm_feedback = f"# 🎓 Final Grade: {final_result['final_score']}/{max_marks}\n\n"
|
| 1499 |
|
| 1500 |
+
# if final_result['needs_review']:
|
| 1501 |
+
# llm_feedback += "⚠️ **Manual Review Recommended** (High grader disagreement)\n\n"
|
| 1502 |
|
| 1503 |
+
# llm_feedback += final_result['consensus_analysis']
|
| 1504 |
|
| 1505 |
+
# # Add statistics
|
| 1506 |
+
# llm_feedback += f"\n\n---\n**Grading Statistics:**\n"
|
| 1507 |
+
# llm_feedback += f"- Individual Scores: {final_result['individual_scores']}\n"
|
| 1508 |
+
# llm_feedback += f"- Score Range: {min(final_result['individual_scores'])}-{max(final_result['individual_scores'])}\n"
|
| 1509 |
+
# llm_feedback += f"- Disagreement: ±{final_result['disagreement']} points\n"
|
| 1510 |
|
| 1511 |
+
# else:
|
| 1512 |
+
# # Single grader mode (for comparison)
|
| 1513 |
+
# result = self.llm.evaluate_single(
|
| 1514 |
+
# context=expanded_context,
|
| 1515 |
+
# question=question,
|
| 1516 |
+
# student_answer=student_answer,
|
| 1517 |
+
# max_marks=max_marks,
|
| 1518 |
+
# grader_id=1,
|
| 1519 |
+
# reference_summary=self.reference_summary
|
| 1520 |
+
# )
|
| 1521 |
+
# llm_feedback = f"# 🎓 Grade: {result['score']}/{max_marks}\n\n{result['analysis']}"
|
| 1522 |
+
|
| 1523 |
+
# return evidence_display, llm_feedback
|
| 1524 |
+
|
| 1525 |
+
# Stage 3: Single Grading
|
| 1526 |
+
if not student_answer:
|
| 1527 |
+
return evidence_display, "Please enter a student answer to grade."
|
| 1528 |
+
|
| 1529 |
+
# Single grader call
|
| 1530 |
+
result = self.llm.evaluate_single(
|
| 1531 |
+
context=expanded_context,
|
| 1532 |
+
question=question,
|
| 1533 |
+
student_answer=student_answer,
|
| 1534 |
+
max_marks=max_marks,
|
| 1535 |
+
grader_id=1,
|
| 1536 |
+
reference_summary=self.reference_summary
|
| 1537 |
+
)
|
| 1538 |
+
|
| 1539 |
+
llm_feedback = f"# 🎓 Grade: {result['score']}/{max_marks}\n\n{result['analysis']}"
|
| 1540 |
|
| 1541 |
return evidence_display, llm_feedback
|
| 1542 |
|
|
|
|
| 1606 |
outputs=[ref_status]
|
| 1607 |
)
|
| 1608 |
|
| 1609 |
+
# run_btn.click(
|
| 1610 |
+
# system.process_query,
|
| 1611 |
+
# inputs=[q_input, a_input, max_marks, ensemble_check],
|
| 1612 |
+
# outputs=[evidence_box, grade_box]
|
| 1613 |
+
# )
|
| 1614 |
+
|
| 1615 |
run_btn.click(
|
| 1616 |
system.process_query,
|
| 1617 |
+
inputs=[q_input, a_input, max_marks], # Removed ensemble_check
|
| 1618 |
outputs=[evidence_box, grade_box]
|
| 1619 |
)
|
| 1620 |
|