heerjtdev committed
Commit 3fff356 · verified · 1 Parent(s): d2523b4

Update app.py

Files changed (1): app.py (+109, −84)
app.py CHANGED
@@ -1280,60 +1280,60 @@ Provide your grading following the mandatory output format.
 # 4. NEW: SUPERVISOR AGGREGATOR
 # Paper insight: Merge ensemble outputs into final decision
 # ---------------------------------------------------------
-class SupervisorAggregator:
-    """
-    Aggregates multiple grader outputs into a final consensus grade.
-    Paper uses another LLM call; we use statistical aggregation for CPU efficiency.
-    """
+# class SupervisorAggregator:
+#     """
+#     Aggregates multiple grader outputs into a final consensus grade.
+#     Paper uses another LLM call; we use statistical aggregation for CPU efficiency.
+#     """

-    def aggregate(self, grader_results: List[Dict], max_marks: int) -> Dict:
-        """
-        Aggregate K=3 grader results into final score.
+#     def aggregate(self, grader_results: List[Dict], max_marks: int) -> Dict:
+#         """
+#         Aggregate K=3 grader results into final score.

-        Returns:
-        - final_score: int (median of ensemble)
-        - disagreement: int (max - min score)
-        - needs_review: bool (high disagreement flag)
-        - consensus_analysis: str
-        """
-        scores = [r['score'] for r in grader_results]
+#         Returns:
+#         - final_score: int (median of ensemble)
+#         - disagreement: int (max - min score)
+#         - needs_review: bool (high disagreement flag)
+#         - consensus_analysis: str
+#         """
+#         scores = [r['score'] for r in grader_results]

-        # Use median for robustness (paper uses supervisor LLM call)
-        final_score = int(np.median(scores))
+#         # Use median for robustness (paper uses supervisor LLM call)
+#         final_score = int(np.median(scores))

-        # Calculate disagreement
-        disagreement = max(scores) - min(scores)
+#         # Calculate disagreement
+#         disagreement = max(scores) - min(scores)

-        # Flag for manual review if disagreement too high
-        # Paper uses Dmax thresholds; we use 40% of max marks
-        needs_review = disagreement >= (0.4 * max_marks)
+#         # Flag for manual review if disagreement too high
+#         # Paper uses Dmax thresholds; we use 40% of max marks
+#         needs_review = disagreement >= (0.4 * max_marks)

-        # Merge analyses
-        consensus_analysis = self._merge_analyses(grader_results, final_score, disagreement)
+#         # Merge analyses
+#         consensus_analysis = self._merge_analyses(grader_results, final_score, disagreement)

-        return {
-            "final_score": final_score,
-            "individual_scores": scores,
-            "disagreement": disagreement,
-            "needs_review": needs_review,
-            "consensus_analysis": consensus_analysis,
-            "grader_details": grader_results
-        }
+#         return {
+#             "final_score": final_score,
+#             "individual_scores": scores,
+#             "disagreement": disagreement,
+#             "needs_review": needs_review,
+#             "consensus_analysis": consensus_analysis,
+#             "grader_details": grader_results
+#         }

-    def _merge_analyses(self, results: List[Dict], final_score: int, disagreement: int) -> str:
-        """Create consensus analysis from multiple graders."""
+#     def _merge_analyses(self, results: List[Dict], final_score: int, disagreement: int) -> str:
+#         """Create consensus analysis from multiple graders."""

-        output = f"**Ensemble Grading Results** (Final: {final_score}, Disagreement: ±{disagreement})\n\n"
+#         output = f"**Ensemble Grading Results** (Final: {final_score}, Disagreement: ±{disagreement})\n\n"

-        for i, result in enumerate(results, 1):
-            output += f"**Grader {i} ({result['score']} points):**\n{result['analysis']}\n\n"
+#         for i, result in enumerate(results, 1):
+#             output += f"**Grader {i} ({result['score']} points):**\n{result['analysis']}\n\n"

-        if disagreement > 0:
-            output += f"\n⚠️ **Note:** Graders disagreed by {disagreement} points. "
-            if disagreement >= 5:
-                output += "Consider manual review."
+#         if disagreement > 0:
+#             output += f"\n⚠️ **Note:** Graders disagreed by {disagreement} points. "
+#             if disagreement >= 5:
+#                 output += "Consider manual review."

-        return output
+#         return output


 # ---------------------------------------------------------
@@ -1390,7 +1390,7 @@ class EnhancedVectorSystem:
         self.llm = LLMEvaluator()
         self.reranker = OnnxReranker()
         self.presence_checker = AnswerPresenceChecker()
-        self.supervisor = SupervisorAggregator()
+        # self.supervisor = SupervisorAggregator()
         self.all_chunks = []
         self.total_chunks = 0
         self.reference_summary = None # Store reference answer summary
@@ -1449,7 +1449,8 @@ class EnhancedVectorSystem:
         self.reference_summary = reference_text.strip()
         return f"✅ Reference answer set ({len(self.reference_summary)} chars). Will be used to calibrate grading."

-    def process_query(self, question, student_answer, max_marks, enable_ensemble=True):
+    # def process_query(self, question, student_answer, max_marks, enable_ensemble=True):
+    def process_query(self, question, student_answer, max_marks):
         """
         Enhanced grading pipeline with multi-stage processing.
         """
@@ -1473,51 +1474,69 @@
         evidence_display += f"> {expanded_context[:500]}..."

         # Stage 3: Ensemble Grading (Paper's key innovation)
-        if not student_answer:
-            return evidence_display, "Please enter a student answer to grade."
-
-        if enable_ensemble:
-            # Run K=3 independent graders
-            grader_results = []
-            for grader_id in range(1, 4): # K=3 ensemble
-                result = self.llm.evaluate_single(
-                    context=expanded_context,
-                    question=question,
-                    student_answer=student_answer,
-                    max_marks=max_marks,
-                    grader_id=grader_id,
-                    reference_summary=self.reference_summary
-                )
-                grader_results.append(result)
+        # if not student_answer:
+        #     return evidence_display, "Please enter a student answer to grade."
+
+        # if enable_ensemble:
+        #     # Run K=3 independent graders
+        #     grader_results = []
+        #     for grader_id in range(1, 4): # K=3 ensemble
+        #         result = self.llm.evaluate_single(
+        #             context=expanded_context,
+        #             question=question,
+        #             student_answer=student_answer,
+        #             max_marks=max_marks,
+        #             grader_id=grader_id,
+        #             reference_summary=self.reference_summary
+        #         )
+        #         grader_results.append(result)

-            # Stage 4: Supervisor Aggregation
-            final_result = self.supervisor.aggregate(grader_results, max_marks)
+        #     # Stage 4: Supervisor Aggregation
+        #     final_result = self.supervisor.aggregate(grader_results, max_marks)

-            # Format output
-            llm_feedback = f"# 🎓 Final Grade: {final_result['final_score']}/{max_marks}\n\n"
+        #     # Format output
+        #     llm_feedback = f"# 🎓 Final Grade: {final_result['final_score']}/{max_marks}\n\n"

-            if final_result['needs_review']:
-                llm_feedback += "⚠️ **Manual Review Recommended** (High grader disagreement)\n\n"
+        #     if final_result['needs_review']:
+        #         llm_feedback += "⚠️ **Manual Review Recommended** (High grader disagreement)\n\n"

-            llm_feedback += final_result['consensus_analysis']
+        #     llm_feedback += final_result['consensus_analysis']

-            # Add statistics
-            llm_feedback += f"\n\n---\n**Grading Statistics:**\n"
-            llm_feedback += f"- Individual Scores: {final_result['individual_scores']}\n"
-            llm_feedback += f"- Score Range: {min(final_result['individual_scores'])}-{max(final_result['individual_scores'])}\n"
-            llm_feedback += f"- Disagreement: ±{final_result['disagreement']} points\n"
+        #     # Add statistics
+        #     llm_feedback += f"\n\n---\n**Grading Statistics:**\n"
+        #     llm_feedback += f"- Individual Scores: {final_result['individual_scores']}\n"
+        #     llm_feedback += f"- Score Range: {min(final_result['individual_scores'])}-{max(final_result['individual_scores'])}\n"
+        #     llm_feedback += f"- Disagreement: ±{final_result['disagreement']} points\n"

-        else:
-            # Single grader mode (for comparison)
-            result = self.llm.evaluate_single(
-                context=expanded_context,
-                question=question,
-                student_answer=student_answer,
-                max_marks=max_marks,
-                grader_id=1,
-                reference_summary=self.reference_summary
-            )
-            llm_feedback = f"# 🎓 Grade: {result['score']}/{max_marks}\n\n{result['analysis']}"
+        # else:
+        #     # Single grader mode (for comparison)
+        #     result = self.llm.evaluate_single(
+        #         context=expanded_context,
+        #         question=question,
+        #         student_answer=student_answer,
+        #         max_marks=max_marks,
+        #         grader_id=1,
+        #         reference_summary=self.reference_summary
+        #     )
+        #     llm_feedback = f"# 🎓 Grade: {result['score']}/{max_marks}\n\n{result['analysis']}"
+
+        # return evidence_display, llm_feedback
+
+        # Stage 3: Single Grading
+        if not student_answer:
+            return evidence_display, "Please enter a student answer to grade."
+
+        # Single grader call
+        result = self.llm.evaluate_single(
+            context=expanded_context,
+            question=question,
+            student_answer=student_answer,
+            max_marks=max_marks,
+            grader_id=1,
+            reference_summary=self.reference_summary
+        )
+
+        llm_feedback = f"# 🎓 Grade: {result['score']}/{max_marks}\n\n{result['analysis']}"

         return evidence_display, llm_feedback

@@ -1587,9 +1606,15 @@ with gr.Blocks(title="EduGenius AI Grader - Enhanced", theme=gr.themes.Soft()) a
         outputs=[ref_status]
     )

+    # run_btn.click(
+    #     system.process_query,
+    #     inputs=[q_input, a_input, max_marks, ensemble_check],
+    #     outputs=[evidence_box, grade_box]
+    # )
+
     run_btn.click(
         system.process_query,
-        inputs=[q_input, a_input, max_marks, ensemble_check],
+        inputs=[q_input, a_input, max_marks], # Removed ensemble_check
        outputs=[evidence_box, grade_box]
     )
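
Note: the ensemble logic commented out above reduces to a median-plus-spread rule. The following is a minimal standalone sketch of that rule, not the app's code as-is: the helper name aggregate_scores is illustrative, it uses statistics.median instead of the app's np.median so it runs without NumPy, and the example grader scores are invented.

    # Sketch of the aggregation rule from the (now disabled) SupervisorAggregator:
    # median of the K grader scores, max-min spread as "disagreement",
    # and a manual-review flag when the spread reaches 40% of max marks.
    from statistics import median
    from typing import Dict, List

    def aggregate_scores(grader_results: List[Dict], max_marks: int) -> Dict:
        scores = [r["score"] for r in grader_results]
        disagreement = max(scores) - min(scores)
        return {
            "final_score": int(median(scores)),
            "individual_scores": scores,
            "disagreement": disagreement,
            "needs_review": disagreement >= 0.4 * max_marks,
        }

    # Example: three hypothetical graders on a 10-mark question.
    print(aggregate_scores([{"score": 6}, {"score": 7}, {"score": 3}], max_marks=10))
    # -> final_score 6, disagreement 4, needs_review True (4 >= 0.4 * 10)

The median keeps a single outlier grader from dragging the final mark, which is why the spread, rather than the mean deviation, is what triggers the review flag.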