Moncey10 committed on
Commit
96ef5e0
·
1 Parent(s): c56fe1e

marked_pdf

Browse files
Files changed (2) hide show
  1. app.py +460 -1
  2. requirements.txt +2 -1
app.py CHANGED
@@ -26,6 +26,16 @@ try:
26
  except Exception:
27
  PdfReader = None
28
 
 
 
 
 
 
 
 
 
 
 
29
  try:
30
  from pdf2image import convert_from_bytes # requires poppler
31
  except Exception:
@@ -57,6 +67,22 @@ app = FastAPI()
57
 
58
  import os
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  @app.get("/debug/env")
61
  def debug_env():
62
  return {
@@ -875,6 +901,142 @@ def extract_text_from_pdf(pdf_bytes: bytes, filename: str = "unknown.pdf") -> Di
875
  return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": False}
876
 
877
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878
  async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
879
  filename = getattr(file, "filename", "") or "upload"
880
  content_type = (getattr(file, "content_type", "") or "").lower()
@@ -947,6 +1109,180 @@ def health_llm():
947
  }
948
 
949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
  @app.post("/homework/validate")
951
  async def homework_validate(
952
  student_id: int = Form(...),
@@ -981,9 +1317,51 @@ async def homework_validate(
981
  # 2) Extract student text
982
  student_info = await extract_text_from_upload(student_file)
983
  student_text = (student_info.get("text") or "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
984
 
985
  MIN_WORDS = 3 if final_question_type == "mcq" else 8
986
  if len(student_text.split()) < MIN_WORDS:
 
 
 
 
 
987
  return {
988
  "student_id": student_id,
989
  "homework_id": homework_id,
@@ -997,10 +1375,16 @@ async def homework_validate(
997
  "rule_based_remark": "Answer text could not be read clearly. Please upload a clearer file.",
998
  "student_extracted_text": student_text,
999
  "llm_used": False,
 
1000
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1001
  }
1002
 
1003
  if student_info.get("needs_ocr") and not student_text:
 
 
 
 
 
1004
  return {
1005
  "student_id": student_id,
1006
  "homework_id": homework_id,
@@ -1014,6 +1398,7 @@ async def homework_validate(
1014
  "rule_based_remark": "This PDF looks scanned. OCR is required (install pdf2image + poppler) or upload a clearer file.",
1015
  "student_extracted_text": student_text,
1016
  "llm_used": False,
 
1017
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1018
  }
1019
 
@@ -1136,6 +1521,12 @@ async def homework_validate(
1136
  else:
1137
  status = "Needs Review"
1138
 
 
 
 
 
 
 
1139
  return {
1140
  "student_id": student_id,
1141
  "homework_id": homework_id,
@@ -1151,6 +1542,7 @@ async def homework_validate(
1151
  "student_extracted_text": student_text,
1152
  "mcq_results": mcq_results,
1153
  "narrative_results": narrative_results,
 
1154
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1155
  "debug": {
1156
  "erp_row_fields": list(erp_row.keys()) if erp_row else [],
@@ -1169,10 +1561,13 @@ async def homework_validate(
1169
 
1170
  # Smart fallback: if answer looks like narrative (not MCQ), treat as narrative instead
1171
  # This handles cases where question type is MCQ but student answered in narrative format
 
 
 
1172
  answer_looks_like_narrative = (
1173
  len(student_text.split()) > 15 and # More than 15 words
1174
  not has_multiple_mcq and # Not multiple numbered MCQ answers
1175
- not re.search(r"\b(option|answer|ans)\s*[:\-]?\s*[a-d]\b", _norm(student_text)) # No explicit option markers
1176
  )
1177
 
1178
  # If answer looks like narrative, redirect to narrative processing
@@ -1227,6 +1622,12 @@ async def homework_validate(
1227
  passing_threshold = mcq_credit["passing_threshold"]
1228
  status = "Verified" if match_percentage >= passing_threshold else "Needs Review"
1229
 
 
 
 
 
 
 
1230
  return {
1231
  "student_id": student_id,
1232
  "homework_id": homework_id,
@@ -1240,11 +1641,17 @@ async def homework_validate(
1240
  "rule_based_remark": f"Multiple MCQ: {correct_count}/{total_count} correct. Score: {match_percentage}% (Level: {student_level})",
1241
  "student_extracted_text": student_text,
1242
  "llm_used": False,
 
1243
  "debug": {"student_answers": student_answers_by_qid, "mcq_results": mcq_results},
1244
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1245
  }
1246
  else:
1247
  # No correct answers in prompt - return needs review with extracted answers
 
 
 
 
 
1248
  return {
1249
  "student_id": student_id,
1250
  "homework_id": homework_id,
@@ -1258,6 +1665,7 @@ async def homework_validate(
1258
  "rule_based_remark": f"Found {len(student_answers_by_qid)} MCQ answers but no correct answers in prompt. Include 'Correct: B' for each question.",
1259
  "student_extracted_text": student_text,
1260
  "llm_used": False,
 
1261
  "debug": {"student_answers": student_answers_by_qid, "correct_answers_in_prompt": False},
1262
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1263
  }
@@ -1265,6 +1673,11 @@ async def homework_validate(
1265
  if redirect_to_narrative:
1266
  pass # Will continue to narrative handling
1267
  elif not correct:
 
 
 
 
 
1268
  return {
1269
  "student_id": student_id,
1270
  "homework_id": homework_id,
@@ -1278,10 +1691,16 @@ async def homework_validate(
1278
  "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
1279
  "student_extracted_text": student_text,
1280
  "llm_used": False,
 
1281
  "debug": {"correct": correct, "chosen": chosen},
1282
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1283
  }
1284
  elif not chosen:
 
 
 
 
 
1285
  return {
1286
  "student_id": student_id,
1287
  "homework_id": homework_id,
@@ -1295,6 +1714,7 @@ async def homework_validate(
1295
  "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
1296
  "student_extracted_text": student_text,
1297
  "llm_used": False,
 
1298
  "debug": {"correct": correct, "chosen": chosen},
1299
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1300
  }
@@ -1314,6 +1734,13 @@ async def homework_validate(
1314
  passing_threshold = mcq_credit["passing_threshold"]
1315
  status = "Verified" if match_percentage >= passing_threshold else "Needs Review"
1316
 
 
 
 
 
 
 
 
1317
  return {
1318
  "student_id": student_id,
1319
  "homework_id": homework_id,
@@ -1327,12 +1754,18 @@ async def homework_validate(
1327
  "rule_based_remark": f"{'Correct' if is_correct else 'Incorrect'}. Score: {match_percentage}% (Level: {student_level}, Credit per Q: {credit_per_q}%)",
1328
  "student_extracted_text": student_text,
1329
  "llm_used": False,
 
1330
  "debug": {"correct": correct, "chosen": chosen, "level": student_level, "credit_per_q": credit_per_q},
1331
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1332
  }
1333
 
1334
 
1335
  if gemini_client is None:
 
 
 
 
 
1336
  return {
1337
  "student_id": student_id,
1338
  "homework_id": homework_id,
@@ -1347,6 +1780,7 @@ async def homework_validate(
1347
  "llm_used": False,
1348
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
1349
  "student_extracted_text": student_text,
 
1350
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1351
  }
1352
 
@@ -1367,6 +1801,11 @@ async def homework_validate(
1367
  )
1368
 
1369
  if not response_text:
 
 
 
 
 
1370
  return {
1371
  "student_id": student_id,
1372
  "homework_id": homework_id,
@@ -1381,6 +1820,7 @@ async def homework_validate(
1381
  "llm_used": False,
1382
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
1383
  "student_extracted_text": student_text,
 
1384
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1385
  }
1386
 
@@ -1388,6 +1828,11 @@ async def homework_validate(
1388
  m = re.search(r"\{.*\}", response_text, flags=re.S)
1389
  payload = json.loads(m.group(0) if m else response_text)
1390
  except Exception as e:
 
 
 
 
 
1391
  return {
1392
  "student_id": student_id,
1393
  "homework_id": homework_id,
@@ -1402,6 +1847,7 @@ async def homework_validate(
1402
  "llm_used": False,
1403
  "llm_error": {"ok": False, "error_type": "GEMINI_BAD_JSON", "message": str(e), "raw": response_text[:800]},
1404
  "student_extracted_text": student_text,
 
1405
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1406
  }
1407
 
@@ -1412,6 +1858,11 @@ async def homework_validate(
1412
  key_points = [str(x).strip() for x in key_points if str(x).strip()]
1413
 
1414
  if not ai_reference_answer:
 
 
 
 
 
1415
  return {
1416
  "student_id": student_id,
1417
  "homework_id": homework_id,
@@ -1425,6 +1876,7 @@ async def homework_validate(
1425
  "rule_based_remark": "AI returned empty reference answer.",
1426
  "llm_used": True,
1427
  "student_extracted_text": student_text,
 
1428
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1429
  }
1430
 
@@ -1474,6 +1926,12 @@ async def homework_validate(
1474
  else:
1475
  rule_based_remark = "Homework does not match the expected answer enough. Please review the topic and resubmit with clearer, complete points."
1476
 
 
 
 
 
 
 
1477
  return {
1478
  "student_id": student_id,
1479
  "homework_id": homework_id,
@@ -1493,6 +1951,7 @@ async def homework_validate(
1493
  "key_points": key_points,
1494
  "key_points_covered": covered,
1495
  "key_points_missing": missing,
 
1496
  "debug": {
1497
  "similarity": sim,
1498
  "coverage": coverage,
 
26
  except Exception:
27
  PdfReader = None
28
 
29
+ try:
30
+ from reportlab.pdfgen import canvas
31
+ from reportlab.lib.pagesizes import letter
32
+ from reportlab.lib import colors
33
+ from reportlab.lib.utils import ImageReader
34
+ import reportlab
35
+ except Exception as e:
36
+ reportlab = None
37
+ print(f"[WARN] reportlab import failed: {e}")
38
+
39
  try:
40
  from pdf2image import convert_from_bytes # requires poppler
41
  except Exception:
 
67
 
68
  import os
69
 
70
# Serve generated files (e.g. annotated PDFs) from the local "outputs" directory.
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse

# Create outputs directory if it doesn't exist
outputs_dir = os.path.join(os.path.dirname(__file__), "outputs")
os.makedirs(outputs_dir, exist_ok=True)


@app.get("/outputs/{filename}")
async def get_output_file(filename: str):
    """Serve a single file from the outputs directory.

    Args:
        filename: Bare file name (no directory components) of a file that
            lives directly inside ``outputs_dir``.

    Returns:
        A ``FileResponse`` streaming the requested file.

    Raises:
        HTTPException: 404 when the name contains path components, refers to
            a directory, or does not exist.
    """
    # The name comes straight from the URL, so treat it as untrusted: only a
    # plain file name is accepted. Backslashes are normalised first so that
    # Windows-style "..\\secret" is caught as well.
    bare_name = os.path.basename(filename.replace("\\", "/"))
    if not bare_name or bare_name != filename or bare_name in (".", ".."):
        raise HTTPException(status_code=404, detail="File not found")

    filepath = os.path.join(outputs_dir, bare_name)
    # isfile (not exists): a directory must not be handed to FileResponse.
    if os.path.isfile(filepath):
        return FileResponse(filepath)
    raise HTTPException(status_code=404, detail="File not found")
85
+
86
  @app.get("/debug/env")
87
  def debug_env():
88
  return {
 
901
  return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": False}
902
 
903
 
904
def create_annotated_pdf(
    original_pdf_bytes: bytes,
    mcq_results: List[Dict[str, Any]] = None,
    match_percentage: int = 0,
    status: str = "Needs Review",
    student_level: str = "Medium"
) -> bytes:
    """
    Create an annotated PDF with tickmarks showing correct/incorrect answers.

    An overlay is drawn for every page of the original document and merged on
    top of it: a column of circled marks near the right margin (one per MCQ
    result) and, on the first page only, a grey summary header with status,
    score and level.

    Args:
        original_pdf_bytes: The original PDF file content
        mcq_results: List of MCQ results with 'correct' and 'qid' fields;
            None or an empty list draws no marks
        match_percentage: Overall match percentage
        status: Validation status ("Verified" is drawn green, "Partial"
            orange, anything else red)
        student_level: Student level label shown in the header

    Returns:
        Annotated PDF as bytes. The original bytes are returned unchanged when
        reportlab is unavailable or when any step of the annotation fails.
    """
    if not reportlab:
        print("[WARN] reportlab not available, returning original PDF")
        return original_pdf_bytes

    header_height = 60   # height of the grey summary band on the first page
    top_margin = 50      # distance from the page top to the first mark
    mark_spacing = 30    # vertical distance between consecutive marks

    try:
        from pypdf import PdfWriter, PdfReader
        from io import BytesIO

        # Read original PDF
        original_reader = PdfReader(BytesIO(original_pdf_bytes))
        writer = PdfWriter()

        # Process each page
        for page_num, page in enumerate(original_reader.pages):
            # The overlay must have exactly the size of the page it covers.
            page_width = float(page.mediabox.width)
            page_height = float(page.mediabox.height)

            packet = BytesIO()
            c = canvas.Canvas(packet, pagesize=(page_width, page_height))

            # Draw tickmarks for MCQ questions along the right margin
            if mcq_results:
                # Results are split into consecutive slices, one slice per
                # page. Results that do not fit on the available pages are
                # not drawn.
                marks_per_page = int((page_height - 100) / mark_spacing)
                start_idx = page_num * marks_per_page
                end_idx = min(start_idx + marks_per_page, len(mcq_results))

                # On the first page the column starts below the summary
                # header; otherwise the header (drawn afterwards) would
                # paint over the first mark.
                y_start = page_height - top_margin
                if page_num == 0:
                    y_start -= header_height
                x_pos = page_width - 60

                for i in range(start_idx, end_idx):
                    result = mcq_results[i]
                    qid = result.get('qid', f'Q{i+1}')
                    is_correct = result.get('correct', False)
                    y_pos = y_start - ((i - start_idx) * mark_spacing)

                    # Circled green tick for correct, circled red cross otherwise
                    mark_color = colors.green if is_correct else colors.red
                    c.setStrokeColor(mark_color)
                    c.setFillColor(mark_color)
                    c.setLineWidth(2)
                    c.circle(x_pos, y_pos, 12, fill=0)
                    c.setFont("Helvetica-Bold", 14)
                    c.drawString(x_pos - 5, y_pos - 5, "✓" if is_correct else "✗")

                    # Question label to the left of the mark
                    c.setStrokeColor(colors.black)
                    c.setFillColor(colors.black)
                    c.setFont("Helvetica", 8)
                    c.drawString(x_pos - 35, y_pos - 3, str(qid))

            # Add header with summary on first page
            if page_num == 0:
                # Header background
                c.setFillColor(colors.lightgrey)
                c.rect(0, page_height - header_height, page_width, header_height, fill=1, stroke=0)

                # Status text, coloured by outcome
                if status == "Verified":
                    status_color = colors.green
                elif status == "Partial":
                    status_color = colors.orange
                else:
                    status_color = colors.red
                c.setFont("Helvetica-Bold", 20)
                c.setFillColor(status_color)
                c.drawString(30, page_height - 30, f"Status: {status}")

                c.setFillColor(colors.black)
                c.setFont("Helvetica-Bold", 18)
                c.drawString(250, page_height - 30, f"Score: {match_percentage}%")
                c.drawString(450, page_height - 30, f"Level: {student_level}")

                # MCQ summary line
                if mcq_results:
                    correct_count = sum(1 for r in mcq_results if r.get('correct'))
                    total_count = len(mcq_results)
                    c.setFont("Helvetica-Bold", 14)
                    c.drawString(30, page_height - 50, f"MCQ: {correct_count}/{total_count} correct")

            c.save()
            packet.seek(0)

            # Merge overlay with original page
            overlay_reader = PdfReader(packet)
            if overlay_reader.pages:
                page.merge_page(overlay_reader.pages[0])

            writer.add_page(page)

        # Write the final PDF
        output = BytesIO()
        writer.write(output)
        return output.getvalue()

    except Exception as e:
        # Annotation is best-effort: never fail the caller, hand back the input.
        print(f"[ERROR] Failed to create annotated PDF: {e}")
        return original_pdf_bytes
1038
+
1039
+
1040
  async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
1041
  filename = getattr(file, "filename", "") or "upload"
1042
  content_type = (getattr(file, "content_type", "") or "").lower()
 
1109
  }
1110
 
1111
 
1112
@app.get("/homework/annotated-url/{homework_id}/{student_id}")
async def get_annotated_pdf_url(
    homework_id: int,
    student_id: int,
):
    """
    Get the URL for the annotated PDF.

    Returns JSON with the URL that can be used in your frontend. The URL is
    built from the APP_BASE_URL environment variable (default
    "http://127.0.0.1:8000") and points at the
    /homework/annotated/{homework_id}/{student_id} endpoint.
    """
    # A configured base URL ending in "/" would otherwise produce "//homework/...".
    base_url = os.getenv("APP_BASE_URL", "http://127.0.0.1:8000").rstrip("/")
    return {
        "homework_id": homework_id,
        "student_id": student_id,
        "annotated_pdf_url": f"{base_url}/homework/annotated/{homework_id}/{student_id}"
    }


1127
@app.get("/homework/annotated/{homework_id}/{student_id}")
async def get_annotated_pdf(
    homework_id: int,
    student_id: int,
):
    """
    Download the annotated PDF with tickmarks for a validated homework.

    The submission is looked up in the ERP, downloaded from storage, re-scored
    (MCQ by comparing extracted options with the answers in the prompt,
    narrative via the Gemini reference answer when a client is configured) and
    returned as an inline PDF with the score overlay applied.

    Raises:
        HTTPException: 404 when no ERP row or submission exists, 400 when the
            submission is not a PDF or its text cannot be extracted, 500 for
            any other failure.
    """
    from fastapi.responses import Response

    def _question_number(raw_qid):
        # "Q3" and "3" must compare equal; ids may be missing or non-string.
        return str(raw_qid or "").replace('Q', '').strip()

    try:
        # Fetch ERP record
        erp_row = fetch_student_record(homework_id, student_id)
        if not erp_row:
            # Without this guard a missing row surfaced as a 500 (AttributeError).
            raise HTTPException(status_code=404, detail="No submission found")

        # Get submission image from ERP
        submission_image = erp_row.get("submission_image")
        if not submission_image:
            raise HTTPException(status_code=404, detail="No submission found")

        # Reject non-PDF submissions before spending a download on them.
        if not submission_image.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="Annotated PDF only available for PDF submissions")

        # Download the original file
        # NOTE(review): requests.get is a blocking call inside an async handler.
        submission_url = STORAGE_BASE + submission_image
        resp = requests.get(submission_url, timeout=30)
        resp.raise_for_status()
        original_content = resp.content

        # Get prompt and question type
        prompt = erp_row.get("prompt") or erp_row.get("question_prompt") or ""
        question_type = erp_row.get("question_type") or erp_row.get("type")
        student_level = fetch_student_level_from_erp(erp_row)

        final_question_type = (question_type or "").strip().lower()
        if final_question_type not in ("mcq", "narrative", "mixed"):
            final_question_type = infer_question_type_from_prompt(prompt)

        # Extract text from PDF
        pdf_info = extract_text_from_pdf(original_content, filename=submission_image)
        student_text = (pdf_info.get("text") or "").strip()
        if len(student_text) < 10:
            raise HTTPException(status_code=400, detail="Could not extract text from PDF")

        # Defaults used when the question type is not scored below (e.g. "mixed").
        mcq_results = []
        status = "Needs Review"
        match_percentage = 0

        # Process based on question type
        if final_question_type == "mcq":
            correct = extract_correct_mcq_from_prompt(prompt)
            chosen = extract_mcq_choice(student_text)
            student_answers_by_qid = extract_mcq_answers_with_qid(student_text)

            if student_answers_by_qid:
                # Multiple MCQ: build an answer key indexed by question number.
                # setdefault keeps the first entry when a number repeats.
                answer_key = {}
                for pq in parse_questions_from_prompt(prompt):
                    if pq.get('type') == 'mcq' and pq.get('correct_answer'):
                        answer_key.setdefault(_question_number(pq.get('qid')), pq.get('correct_answer'))

                for qid, student_ans in student_answers_by_qid.items():
                    expected = answer_key.get(_question_number(qid))
                    if expected is None:
                        # Student answered a question the prompt has no key for.
                        mcq_results.append({'qid': qid, 'chosen': student_ans, 'correct_answer': None, 'correct': False})
                        continue
                    is_correct = str(student_ans or "").strip().lower() == str(expected).strip().lower()
                    mcq_results.append({
                        'qid': qid,
                        'chosen': student_ans,
                        'correct_answer': expected,
                        'correct': is_correct
                    })

                if mcq_results:
                    correct_count = sum(1 for r in mcq_results if r.get('correct'))
                    mcq_credit = mcq_partial_credit(student_level)
                    match_percentage = int((correct_count * mcq_credit["credit_per_question"]) / max(1, len(mcq_results)))
                    status = "Verified" if match_percentage >= mcq_credit["passing_threshold"] else "Needs Review"
            elif correct and chosen:
                # Single MCQ
                is_correct = (chosen == correct)
                mcq_credit = mcq_partial_credit(student_level)
                match_percentage = mcq_credit["credit_per_question"] if is_correct else 0
                status = "Verified" if match_percentage >= mcq_credit["passing_threshold"] else "Needs Review"
                mcq_results = [{'qid': 'Q1', 'correct': is_correct, 'chosen': chosen, 'correct_answer': correct}]

        # For narrative, calculate score using AI
        if final_question_type == "narrative" and gemini_client:
            # Generate AI reference answer
            ai_prompt = (
                f"STUDENT_LEVEL: {student_level}\n"
                f"QUESTION:\n{prompt.strip()}\n\n"
                'Return ONLY valid JSON with keys: {"ai_reference_answer": string, "key_points": [string, ...]}.'
            )

            response_text = generate_gemini_response(
                prompt=ai_prompt,
                system_prompt="Generate a correct reference answer for homework evaluation. Keep it aligned with the student level. Output strict JSON only.",
                max_tokens=650,
                temperature=0.3,
            )

            if response_text:
                try:
                    import re
                    # The model may wrap the JSON in prose; take the outermost {...}.
                    m = re.search(r'\{.*\}', response_text, flags=re.S)
                    payload = json.loads(m.group(0) if m else response_text)

                    ai_reference_answer = (payload.get("ai_reference_answer") or "").strip()
                    key_points = payload.get("key_points") or []
                    key_points = [str(x).strip() for x in key_points if str(x).strip()]

                    # Same rule as homework_validate: an empty reference answer
                    # is not scored, the defaults (0 / "Needs Review") stand.
                    if ai_reference_answer:
                        policy = level_policy(student_level)
                        sim = cosine_sim(student_text, ai_reference_answer)
                        covered, missing, coverage = keypoint_coverage(student_text, key_points, kp_threshold=policy["kp_thr"])

                        final = policy["w_sim"] * sim + policy["w_cov"] * coverage
                        match_percentage = int(round(final * 100))

                        if match_percentage >= policy["verified"]:
                            status = "Verified"
                        elif match_percentage >= policy["partial"]:
                            status = "Partial"
                        else:
                            status = "Needs Review"
                except Exception as e:
                    # Best effort: a bad AI payload leaves the default score in place.
                    print(f"[WARN] Failed to calculate narrative score: {e}")

        # Create annotated PDF
        annotated_pdf = create_annotated_pdf(
            original_pdf_bytes=original_content,
            mcq_results=mcq_results,
            match_percentage=match_percentage,
            status=status,
            student_level=student_level
        )

        # Return as file download
        return Response(
            content=annotated_pdf,
            media_type="application/pdf",
            headers={"Content-Disposition": f"inline; filename=annotated_homework_{homework_id}_{student_id}.pdf"}
        )

    except HTTPException:
        # Deliberate 4xx responses raised above must pass through untouched.
        raise
    except Exception as e:
        print(f"[ERROR] Failed to generate annotated PDF: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate PDF: {str(e)}") from e
1284
+
1285
+
1286
  @app.post("/homework/validate")
1287
  async def homework_validate(
1288
  student_id: int = Form(...),
 
1317
  # 2) Extract student text
1318
  student_info = await extract_text_from_upload(student_file)
1319
  student_text = (student_info.get("text") or "").strip()
1320
+
1321
+ # Keep a copy of the original file bytes for PDF annotation
1322
+ # Reset file cursor and read again
1323
+ await student_file.seek(0)
1324
+ original_file_bytes = await student_file.read()
1325
+ await student_file.seek(0) # Reset for any further processing
1326
+
1327
+ # Check if it's a PDF
1328
+ is_pdf_submission = student_info.get("kind") == "pdf"
1329
+
1330
+ # Initialize annotated PDF filename
1331
+ annotated_pdf_filename = None
1332
+
1333
+ # Function to save annotated PDF
1334
def save_annotated_pdf(pdf_bytes, hw_id, stud_id, results, score, stat, lvl):
    """Annotate ``pdf_bytes`` and store the result in the local outputs folder.

    The file is written as ``marked_<hw_id>_<stud_id>.pdf`` inside an
    ``outputs`` directory next to this module (created on demand).

    Returns the file name on success, or None when the input is empty or
    shorter than 100 bytes, or when annotating/writing fails.
    """
    if not pdf_bytes or len(pdf_bytes) < 100:
        return None
    try:
        target_dir = os.path.join(os.path.dirname(__file__), "outputs")
        os.makedirs(target_dir, exist_ok=True)
        filename = f"marked_{hw_id}_{stud_id}.pdf"

        marked_bytes = create_annotated_pdf(
            original_pdf_bytes=pdf_bytes,
            mcq_results=results,
            match_percentage=score,
            status=stat,
            student_level=lvl
        )

        with open(os.path.join(target_dir, filename), "wb") as out_file:
            out_file.write(marked_bytes)
    except Exception as e:
        # Saving the marked copy is optional; report and carry on without it.
        print(f"[WARN] Failed to save annotated PDF: {e}")
        return None
    return filename
1357
 
1358
  MIN_WORDS = 3 if final_question_type == "mcq" else 8
1359
  if len(student_text.split()) < MIN_WORDS:
1360
+ # Save annotated PDF even for unreadable (with status shown)
1361
+ if is_pdf_submission and original_file_bytes:
1362
+ annotated_pdf_filename = save_annotated_pdf(
1363
+ original_file_bytes, homework_id, student_id, [], 0, "Unreadable", student_level
1364
+ )
1365
  return {
1366
  "student_id": student_id,
1367
  "homework_id": homework_id,
 
1375
  "rule_based_remark": "Answer text could not be read clearly. Please upload a clearer file.",
1376
  "student_extracted_text": student_text,
1377
  "llm_used": False,
1378
+ "annotated_pdf": annotated_pdf_filename,
1379
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1380
  }
1381
 
1382
  if student_info.get("needs_ocr") and not student_text:
1383
+ # Save annotated PDF even for unreadable (with status shown)
1384
+ if is_pdf_submission and original_file_bytes:
1385
+ annotated_pdf_filename = save_annotated_pdf(
1386
+ original_file_bytes, homework_id, student_id, [], 0, "Unreadable", student_level
1387
+ )
1388
  return {
1389
  "student_id": student_id,
1390
  "homework_id": homework_id,
 
1398
  "rule_based_remark": "This PDF looks scanned. OCR is required (install pdf2image + poppler) or upload a clearer file.",
1399
  "student_extracted_text": student_text,
1400
  "llm_used": False,
1401
+ "annotated_pdf": annotated_pdf_filename,
1402
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1403
  }
1404
 
 
1521
  else:
1522
  status = "Needs Review"
1523
 
1524
+ # Save annotated PDF
1525
+ if is_pdf_submission and original_file_bytes and mcq_results:
1526
+ annotated_pdf_filename = save_annotated_pdf(
1527
+ original_file_bytes, homework_id, student_id, mcq_results, final_score, status, student_level
1528
+ )
1529
+
1530
  return {
1531
  "student_id": student_id,
1532
  "homework_id": homework_id,
 
1542
  "student_extracted_text": student_text,
1543
  "mcq_results": mcq_results,
1544
  "narrative_results": narrative_results,
1545
+ "annotated_pdf": annotated_pdf_filename,
1546
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1547
  "debug": {
1548
  "erp_row_fields": list(erp_row.keys()) if erp_row else [],
 
1561
 
1562
  # Smart fallback: if answer looks like narrative (not MCQ), treat as narrative instead
1563
  # This handles cases where question type is MCQ but student answered in narrative format
1564
+ # BUT if the answer contains Option A/B/C/D, treat as MCQ
1565
+ answer_has_mcq_option = bool(re.search(r"\b(option|answer|ans)\s*[:\-]?\s*[a-d]\b", _norm(student_text)))
1566
+
1567
  answer_looks_like_narrative = (
1568
  len(student_text.split()) > 15 and # More than 15 words
1569
  not has_multiple_mcq and # Not multiple numbered MCQ answers
1570
+ not answer_has_mcq_option # No explicit option markers
1571
  )
1572
 
1573
  # If answer looks like narrative, redirect to narrative processing
 
1622
  passing_threshold = mcq_credit["passing_threshold"]
1623
  status = "Verified" if match_percentage >= passing_threshold else "Needs Review"
1624
 
1625
+ # Save annotated PDF
1626
+ if is_pdf_submission and original_file_bytes:
1627
+ annotated_pdf_filename = save_annotated_pdf(
1628
+ original_file_bytes, homework_id, student_id, mcq_results, match_percentage, status, student_level
1629
+ )
1630
+
1631
  return {
1632
  "student_id": student_id,
1633
  "homework_id": homework_id,
 
1641
  "rule_based_remark": f"Multiple MCQ: {correct_count}/{total_count} correct. Score: {match_percentage}% (Level: {student_level})",
1642
  "student_extracted_text": student_text,
1643
  "llm_used": False,
1644
+ "annotated_pdf": annotated_pdf_filename,
1645
  "debug": {"student_answers": student_answers_by_qid, "mcq_results": mcq_results},
1646
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1647
  }
1648
  else:
1649
  # No correct answers in prompt - return needs review with extracted answers
1650
+ # Save annotated PDF
1651
+ if is_pdf_submission and original_file_bytes:
1652
+ annotated_pdf_filename = save_annotated_pdf(
1653
+ original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
1654
+ )
1655
  return {
1656
  "student_id": student_id,
1657
  "homework_id": homework_id,
 
1665
  "rule_based_remark": f"Found {len(student_answers_by_qid)} MCQ answers but no correct answers in prompt. Include 'Correct: B' for each question.",
1666
  "student_extracted_text": student_text,
1667
  "llm_used": False,
1668
+ "annotated_pdf": annotated_pdf_filename,
1669
  "debug": {"student_answers": student_answers_by_qid, "correct_answers_in_prompt": False},
1670
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1671
  }
 
1673
  if redirect_to_narrative:
1674
  pass # Will continue to narrative handling
1675
  elif not correct:
1676
+ # Save annotated PDF
1677
+ if is_pdf_submission and original_file_bytes:
1678
+ annotated_pdf_filename = save_annotated_pdf(
1679
+ original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
1680
+ )
1681
  return {
1682
  "student_id": student_id,
1683
  "homework_id": homework_id,
 
1691
  "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
1692
  "student_extracted_text": student_text,
1693
  "llm_used": False,
1694
+ "annotated_pdf": annotated_pdf_filename,
1695
  "debug": {"correct": correct, "chosen": chosen},
1696
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1697
  }
1698
  elif not chosen:
1699
+ # Save annotated PDF
1700
+ if is_pdf_submission and original_file_bytes:
1701
+ annotated_pdf_filename = save_annotated_pdf(
1702
+ original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
1703
+ )
1704
  return {
1705
  "student_id": student_id,
1706
  "homework_id": homework_id,
 
1714
  "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
1715
  "student_extracted_text": student_text,
1716
  "llm_used": False,
1717
+ "annotated_pdf": annotated_pdf_filename,
1718
  "debug": {"correct": correct, "chosen": chosen},
1719
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1720
  }
 
1734
  passing_threshold = mcq_credit["passing_threshold"]
1735
  status = "Verified" if match_percentage >= passing_threshold else "Needs Review"
1736
 
1737
+ # Save annotated PDF
1738
+ mcq_results_single = [{'qid': 'Q1', 'correct': is_correct, 'chosen': chosen, 'correct_answer': correct}]
1739
+ if is_pdf_submission and original_file_bytes:
1740
+ annotated_pdf_filename = save_annotated_pdf(
1741
+ original_file_bytes, homework_id, student_id, mcq_results_single, match_percentage, status, student_level
1742
+ )
1743
+
1744
  return {
1745
  "student_id": student_id,
1746
  "homework_id": homework_id,
 
1754
  "rule_based_remark": f"{'Correct' if is_correct else 'Incorrect'}. Score: {match_percentage}% (Level: {student_level}, Credit per Q: {credit_per_q}%)",
1755
  "student_extracted_text": student_text,
1756
  "llm_used": False,
1757
+ "annotated_pdf": annotated_pdf_filename,
1758
  "debug": {"correct": correct, "chosen": chosen, "level": student_level, "credit_per_q": credit_per_q},
1759
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1760
  }
1761
 
1762
 
1763
  if gemini_client is None:
1764
+ # Save annotated PDF
1765
+ if is_pdf_submission and original_file_bytes:
1766
+ annotated_pdf_filename = save_annotated_pdf(
1767
+ original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
1768
+ )
1769
  return {
1770
  "student_id": student_id,
1771
  "homework_id": homework_id,
 
1780
  "llm_used": False,
1781
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
1782
  "student_extracted_text": student_text,
1783
+ "annotated_pdf": annotated_pdf_filename,
1784
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1785
  }
1786
 
 
1801
  )
1802
 
1803
  if not response_text:
1804
+ # Save annotated PDF
1805
+ if is_pdf_submission and original_file_bytes:
1806
+ annotated_pdf_filename = save_annotated_pdf(
1807
+ original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
1808
+ )
1809
  return {
1810
  "student_id": student_id,
1811
  "homework_id": homework_id,
 
1820
  "llm_used": False,
1821
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
1822
  "student_extracted_text": student_text,
1823
+ "annotated_pdf": annotated_pdf_filename,
1824
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1825
  }
1826
 
 
1828
  m = re.search(r"\{.*\}", response_text, flags=re.S)
1829
  payload = json.loads(m.group(0) if m else response_text)
1830
  except Exception as e:
1831
+ # Save annotated PDF
1832
+ if is_pdf_submission and original_file_bytes:
1833
+ annotated_pdf_filename = save_annotated_pdf(
1834
+ original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
1835
+ )
1836
  return {
1837
  "student_id": student_id,
1838
  "homework_id": homework_id,
 
1847
  "llm_used": False,
1848
  "llm_error": {"ok": False, "error_type": "GEMINI_BAD_JSON", "message": str(e), "raw": response_text[:800]},
1849
  "student_extracted_text": student_text,
1850
+ "annotated_pdf": annotated_pdf_filename,
1851
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1852
  }
1853
 
 
1858
  key_points = [str(x).strip() for x in key_points if str(x).strip()]
1859
 
1860
  if not ai_reference_answer:
1861
+ # Save annotated PDF
1862
+ if is_pdf_submission and original_file_bytes:
1863
+ annotated_pdf_filename = save_annotated_pdf(
1864
+ original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
1865
+ )
1866
  return {
1867
  "student_id": student_id,
1868
  "homework_id": homework_id,
 
1876
  "rule_based_remark": "AI returned empty reference answer.",
1877
  "llm_used": True,
1878
  "student_extracted_text": student_text,
1879
+ "annotated_pdf": annotated_pdf_filename,
1880
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
1881
  }
1882
 
 
1926
  else:
1927
  rule_based_remark = "Homework does not match the expected answer enough. Please review the topic and resubmit with clearer, complete points."
1928
 
1929
+ # Save annotated PDF for narrative (with status but no MCQ marks)
1930
+ if is_pdf_submission and original_file_bytes:
1931
+ annotated_pdf_filename = save_annotated_pdf(
1932
+ original_file_bytes, homework_id, student_id, [], match_pct, status, student_level
1933
+ )
1934
+
1935
  return {
1936
  "student_id": student_id,
1937
  "homework_id": homework_id,
 
1951
  "key_points": key_points,
1952
  "key_points_covered": covered,
1953
  "key_points_missing": missing,
1954
+ "annotated_pdf": annotated_pdf_filename,
1955
  "debug": {
1956
  "similarity": sim,
1957
  "coverage": coverage,
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- fastapi
2
  uvicorn
3
  pytesseract
4
  pillow
@@ -19,5 +19,6 @@ easyocr
19
  python-docx
20
  pypdf
21
  pdf2image
 
22
  python-dotenv
23
  google-genai
 
1
+ fastapi
2
  uvicorn
3
  pytesseract
4
  pillow
 
19
  python-docx
20
  pypdf
21
  pdf2image
22
+ reportlab
23
  python-dotenv
24
  google-genai