Moncey10 committed on
Commit
09a3aa9
·
1 Parent(s): 5fe9776

image returned pdf

Browse files
Files changed (1) hide show
  1. app.py +125 -423
app.py CHANGED
@@ -10,15 +10,11 @@ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from PIL import Image, ImageOps, ImageFilter
12
  import pytesseract
13
- <<<<<<< HEAD
14
  import os
15
 
16
  # Serve static files from outputs directory
17
  from fastapi.staticfiles import StaticFiles
18
  from fastapi.responses import FileResponse
19
- =======
20
-
21
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
22
  from dotenv import load_dotenv
23
  load_dotenv()
24
 
@@ -34,7 +30,6 @@ except Exception:
34
  PdfReader = None
35
 
36
  try:
37
- <<<<<<< HEAD
38
  from reportlab.pdfgen import canvas
39
  from reportlab.lib.pagesizes import letter
40
  from reportlab.lib import colors
@@ -45,8 +40,6 @@ except Exception as e:
45
  print(f"[WARN] reportlab import failed: {e}")
46
 
47
  try:
48
- =======
49
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
50
  from pdf2image import convert_from_bytes # requires poppler
51
  except Exception:
52
  convert_from_bytes = None
@@ -62,7 +55,6 @@ except Exception as e:
62
  genai = None
63
  print(f"[WARN] google-genai import failed: {e}")
64
 
65
- <<<<<<< HEAD
66
  # ✅ Google Cloud Vision SDK (for better handwritten OCR)
67
  try:
68
  from google.cloud import vision
@@ -119,13 +111,6 @@ def debug_env():
119
  "num_keys": len(GOOGLE_API_KEYS),
120
  "has_openai_key": bool(os.getenv("OPENAI_API_KEY")),
121
  }
122
- =======
123
-
124
- # =========================================================
125
- # ✅ FASTAPI APP INSTANCE
126
- # =========================================================
127
- app = FastAPI()
128
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
129
  app.add_middleware(
130
  CORSMiddleware,
131
  allow_origins=["*"],
@@ -134,33 +119,20 @@ app.add_middleware(
134
  allow_headers=["*"],
135
  )
136
 
137
- <<<<<<< HEAD
138
 
139
 
140
- =======
141
- # =========================================================
142
- # ✅ TESSERACT PATH
143
- # =========================================================
144
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
145
  if os.name == "nt":
146
  pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
147
  else:
148
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
149
 
150
 
151
- <<<<<<< HEAD
152
 
153
- =======
154
- # =========================================================
155
- # ✅ ERP CONFIG
156
- # =========================================================
157
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
158
  ERP_BASE = os.getenv("ERP_BASE", "https://erp.triz.co.in/lms_data")
159
  STORAGE_BASE = os.getenv("STORAGE_BASE", "https://erp.triz.co.in/storage/student/")
160
  ERP_TOKEN = os.getenv("ERP_TOKEN", "")
161
 
162
 
163
- <<<<<<< HEAD
164
  def get_public_base_url() -> str:
165
  """
166
  Returns the public base URL of this server.
@@ -279,58 +251,27 @@ def _init_gemini_client(key_index: int = 0) -> None:
279
  return
280
 
281
  api_key = GOOGLE_API_KEYS[key_index]
282
- =======
283
- # =========================================================
284
- # ✅ GEMINI CONFIG
285
- # =========================================================
286
- GOOGLE_API_KEY = (os.getenv("GOOGLE_API_KEY") or "").strip()
287
- GEMINI_MODEL = (os.getenv("GEMINI_MODEL", "models/gemini-2.0-flash") or "").strip()
288
- if GEMINI_MODEL and not GEMINI_MODEL.startswith("models/"):
289
- GEMINI_MODEL = "models/" + GEMINI_MODEL
290
-
291
- gemini_client = None
292
- GEMINI_LAST_ERROR = ""
293
-
294
-
295
- def _init_gemini_client() -> None:
296
- global gemini_client, GEMINI_LAST_ERROR
297
-
298
- if gemini_client is not None:
299
- return
300
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
301
 
302
  if not genai:
303
  GEMINI_LAST_ERROR = "google-genai not installed / import failed"
304
  gemini_client = None
305
  return
306
 
307
- <<<<<<< HEAD
308
  if not api_key:
309
  GEMINI_LAST_ERROR = f"GOOGLE_API_KEY_{key_index + 1} not set"
310
- =======
311
- if not GOOGLE_API_KEY:
312
- GEMINI_LAST_ERROR = "GOOGLE_API_KEY not set"
313
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
314
  gemini_client = None
315
  return
316
 
317
  try:
318
- <<<<<<< HEAD
319
  gemini_client = genai.Client(api_key=api_key)
320
  GEMINI_LAST_ERROR = ""
321
  print(f"[INFO] Gemini client initialized with key #{key_index + 1}")
322
- =======
323
- gemini_client = genai.Client(api_key=GOOGLE_API_KEY)
324
- GEMINI_LAST_ERROR = ""
325
- print("[INFO] Gemini client initialized")
326
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
327
  except Exception as e:
328
  gemini_client = None
329
  GEMINI_LAST_ERROR = str(e)
330
  print(f"[WARN] Gemini init failed: {GEMINI_LAST_ERROR}")
331
 
332
 
333
- <<<<<<< HEAD
334
  def _is_rate_limit_error(error_msg: str) -> bool:
335
  """Check if the error is a rate limit error (429) or service unavailable (503)."""
336
  if not error_msg:
@@ -373,9 +314,6 @@ def _rotate_to_next_key() -> bool:
373
 
374
 
375
  _init_gemini_client(0)
376
- =======
377
- _init_gemini_client()
378
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
379
 
380
 
381
  def parse_gemini_error(error_msg: str) -> dict:
@@ -391,7 +329,6 @@ def parse_gemini_error(error_msg: str) -> dict:
391
  return {"ok": False, "error_type": "GEMINI_ERROR", "message": msg}
392
 
393
 
394
- <<<<<<< HEAD
395
 
396
  def extract_qid_from_prompt(prompt: str, erp_row: dict = None) -> str:
397
  """
@@ -438,32 +375,22 @@ def extract_qid_from_prompt(prompt: str, erp_row: dict = None) -> str:
438
  return "Q1"
439
 
440
 
441
- =======
442
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
443
  def generate_gemini_response(
444
  prompt: str,
445
  system_prompt: str = "",
446
  max_tokens: int = 650,
447
  temperature: float = 0.3,
448
  ) -> str:
449
- <<<<<<< HEAD
450
  global GEMINI_LAST_ERROR, gemini_client, rate_limited_keys
451
- =======
452
- global GEMINI_LAST_ERROR
453
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
454
 
455
  if gemini_client is None:
456
  if not GEMINI_LAST_ERROR:
457
  GEMINI_LAST_ERROR = "Gemini client not initialized"
458
- <<<<<<< HEAD
459
  # Try to reinitialize if we have keys available
460
  if GOOGLE_API_KEYS and current_key_index not in rate_limited_keys:
461
  _init_gemini_client(current_key_index)
462
  if gemini_client is None:
463
  return ""
464
- =======
465
- return ""
466
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
467
 
468
  try:
469
  contents = []
@@ -481,7 +408,6 @@ def generate_gemini_response(
481
  GEMINI_LAST_ERROR = ""
482
  return text
483
  except Exception as e:
484
- <<<<<<< HEAD
485
  error_msg = str(e)
486
  print(f"[ERROR] Gemini call failed: {error_msg}")
487
 
@@ -493,10 +419,6 @@ def generate_gemini_response(
493
  return generate_gemini_response(prompt, system_prompt, max_tokens, temperature)
494
 
495
  GEMINI_LAST_ERROR = error_msg
496
- =======
497
- GEMINI_LAST_ERROR = str(e)
498
- print(f"[ERROR] Gemini call failed: {GEMINI_LAST_ERROR}")
499
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
500
  return ""
501
 
502
  import time
@@ -530,13 +452,7 @@ def cheap_overlap_score(student_text: str, prompt: str) -> int:
530
  return int(round(min(0.6, overlap) * 100)) # cap at 60
531
 
532
 
533
- <<<<<<< HEAD
534
 
535
- =======
536
- # =========================================================
537
- # ✅ SMALL UTILS
538
- # =========================================================
539
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
540
  def _norm(s: str) -> str:
541
  return re.sub(r"\s+", " ", (s or "").strip().lower())
542
 
@@ -571,7 +487,6 @@ def level_policy(student_level: str) -> dict:
571
  return {"w_sim": 0.6, "w_cov": 0.4, "verified": 75, "partial": 55, "kp_thr": 0.20}
572
 
573
 
574
- <<<<<<< HEAD
575
  def mcq_partial_credit(student_level: str) -> dict:
576
  """
577
  Returns partial credit percentage for MCQ questions based on student level.
@@ -592,8 +507,6 @@ def mcq_partial_credit(student_level: str) -> dict:
592
  return {"credit_per_question": 75, "passing_threshold": 75}
593
 
594
 
595
- =======
596
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
597
  def keypoint_coverage(student_text: str, key_points: List[str], kp_threshold: float) -> Tuple[List[str], List[str], float]:
598
  covered, missing = [], []
599
  for kp in key_points:
@@ -611,15 +524,8 @@ def keypoint_coverage(student_text: str, key_points: List[str], kp_threshold: fl
611
  return covered, missing, coverage
612
 
613
 
614
- <<<<<<< HEAD
615
 
616
  def infer_question_type_from_prompt(prompt: str, student_text: str = "") -> str:
617
- =======
618
- # =========================================================
619
- # ✅ QUESTION TYPE INFERENCE + MCQ PARSING
620
- # =========================================================
621
- def infer_question_type_from_prompt(prompt: str) -> str:
622
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
623
  p = _norm(prompt)
624
 
625
  # Explicit markers - check for (mcq) first since it's common in parentheses
@@ -628,7 +534,6 @@ def infer_question_type_from_prompt(prompt: str) -> str:
628
  if re.search(r"\btype\s*:\s*narrative\b", p) or re.search(r"\bquestion_type\s*:\s*narrative\b", p):
629
  return "narrative"
630
 
631
- <<<<<<< HEAD
632
  # Heuristic: options A/B/C/D exist in prompt -> likely MCQ
633
  if re.search(r"\b(a|b|c|d)\s*[\)\.]\s+", p) or "option a" in p or "option b" in p:
634
  return "mcq"
@@ -646,11 +551,6 @@ def infer_question_type_from_prompt(prompt: str) -> str:
646
  # If answer starts with A. or B. etc.
647
  if re.search(r"^[a-d]\.\s+", s.strip()):
648
  return "mcq"
649
- =======
650
- # Heuristic: options A/B/C/D exist -> likely MCQ
651
- if re.search(r"\b(a|b|c|d)\s*[\)\.]\s+", p) or "option a" in p or "option b" in p:
652
- return "mcq"
653
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
654
 
655
  return "narrative"
656
 
@@ -715,7 +615,6 @@ def parse_questions_from_prompt(prompt: str) -> List[Dict[str, Any]]:
715
 
716
  # Check for correct answer (for MCQ)
717
  if current_type == 'mcq':
718
- <<<<<<< HEAD
719
  # First check: is this line "Correct Answer(s):" with nothing after it?
720
  # If so, we need to look for the answer on the next line
721
  if re.search(r'^correct\s*answer\s*\(?s\)?\s*[:\.]?\s*$', line, re.IGNORECASE):
@@ -748,12 +647,6 @@ def parse_questions_from_prompt(prompt: str) -> List[Dict[str, Any]]:
748
  else:
749
  # Try to extract first letter
750
  current_correct = correct_text[0].upper() if correct_text else None
751
- =======
752
- # Look for "Correct Answer(s):" or "Correct:" or "Answer:"
753
- correct_match = re.search(r'(?:Correct\s*(?:Answer)?|Answer)[:.]\s*(?:[A-D]\.?\s*)?(.+)', line, re.IGNORECASE)
754
- if correct_match and not current_correct:
755
- current_correct = correct_match.group(1).strip()
756
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
757
 
758
  # Don't forget the last question
759
  if current_q is not None:
@@ -767,11 +660,7 @@ def parse_questions_from_prompt(prompt: str) -> List[Dict[str, Any]]:
767
  # If no questions parsed, fall back to old behavior
768
  if not questions:
769
  qtype = infer_question_type_from_prompt(prompt)
770
- <<<<<<< HEAD
771
  return [{'qid': extract_qid_from_prompt(prompt), 'type': qtype, 'question': prompt, 'correct_answer': None}]
772
- =======
773
- return [{'qid': 'Q1', 'type': qtype, 'question': prompt, 'correct_answer': None}]
774
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
775
 
776
  return questions
777
 
@@ -803,7 +692,6 @@ def extract_mcq_choice(text: str) -> str:
803
  return ""
804
 
805
 
806
- <<<<<<< HEAD
807
  def extract_mcq_answers_with_qid(text: str) -> Dict[str, str]:
808
  """
809
  Extract MCQ answers WITH question numbers from student text.
@@ -859,8 +747,6 @@ def extract_mcq_answers_with_qid(text: str) -> Dict[str, str]:
859
  return results
860
 
861
 
862
- =======
863
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
864
  def extract_correct_mcq_from_prompt(prompt: str) -> str:
865
  """
866
  This is IMPORTANT:
@@ -868,7 +754,6 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
868
  - Correct: B
869
  - Answer: C
870
  - correct_option: D
871
- <<<<<<< HEAD
872
  - Correct Answer(s): A. Devdatta
873
  or JSON: {"correct_option":"B"}
874
 
@@ -877,9 +762,6 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
877
  - "Correct Answer(s): A. Devdatta"
878
  - "Correct: B"
879
  - "Answer: C"
880
- =======
881
- or JSON: {"correct_option":"B"}
882
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
883
  """
884
  p = (prompt or "").strip()
885
  if not p:
@@ -896,7 +778,6 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
896
  except Exception:
897
  pass
898
 
899
- <<<<<<< HEAD
900
  # Text prompt support - new format: "Correct Answer(s): A. Devdatta" or "Correct Answer: B"
901
  t = _norm(p)
902
 
@@ -919,10 +800,6 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
919
  return m1c.group(1)
920
 
921
  # Pattern 2: "Correct: A" or "Answer: B" (original pattern)
922
- =======
923
- # Text prompt support
924
- t = _norm(p)
925
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
926
  m = re.search(r"\b(correct|answer|ans)\s*[:\-]?\s*\(?\s*([a-d])\s*\)?\b", t)
927
  if m:
928
  return m.group(2)
@@ -930,13 +807,7 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
930
  return ""
931
 
932
 
933
- <<<<<<< HEAD
934
 
935
- =======
936
- # =========================================================
937
- # ✅ ERP HELPERS
938
- # =========================================================
939
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
940
  def _erp_get(params: dict) -> list:
941
  headers = {}
942
  if ERP_TOKEN:
@@ -968,7 +839,6 @@ def fetch_student_level_from_erp(row: Dict[str, Any]) -> str:
968
  return "Medium"
969
 
970
 
971
- <<<<<<< HEAD
972
 
973
  def _preprocess_for_ocr(img: Image.Image) -> Image.Image:
974
  """
@@ -1037,25 +907,6 @@ def _extract_text_google_vision(image_bytes: bytes) -> str:
1037
  return ""
1038
 
1039
 
1040
- =======
1041
- # =========================================================
1042
- # ✅ OCR + TEXT EXTRACTION
1043
- # =========================================================
1044
- def _preprocess_for_ocr(img: Image.Image) -> Image.Image:
1045
- img = img.convert("L")
1046
- img = ImageOps.autocontrast(img)
1047
-
1048
- w, h = img.size
1049
- if max(w, h) < 1600:
1050
- scale = 1600 / max(w, h)
1051
- img = img.resize((int(w * scale), int(h * scale)))
1052
-
1053
- img = img.filter(ImageFilter.SHARPEN)
1054
- img = img.point(lambda p: 255 if p > 170 else 0)
1055
- return img
1056
-
1057
-
1058
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1059
  def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> str:
1060
  if not image_bytes or len(image_bytes) < 50:
1061
  raise HTTPException(status_code=400, detail=f"Invalid file: '{filename}' - empty/too small")
@@ -1072,7 +923,6 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
1072
  head = image_bytes[:12]
1073
  raise HTTPException(status_code=400, detail=f"Invalid image format: '{filename}' (header={head})")
1074
 
1075
- <<<<<<< HEAD
1076
  # First try Google Cloud Vision (better for handwriting)
1077
  if vision_client:
1078
  gv_text = _extract_text_google_vision(image_bytes)
@@ -1080,8 +930,6 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
1080
  return _clean_extracted_text(gv_text)
1081
 
1082
  # Fallback to Tesseract with improved preprocessing
1083
- =======
1084
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1085
  try:
1086
  img = Image.open(io.BytesIO(image_bytes))
1087
  except Exception as e:
@@ -1089,7 +937,6 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
1089
 
1090
  img = _preprocess_for_ocr(img)
1091
 
1092
- <<<<<<< HEAD
1093
  # Try multiple OCR configurations for better handwritten recognition
1094
  ocr_configs = [
1095
  "--oem 3 --psm 6", # Default
@@ -1118,16 +965,6 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
1118
  raise HTTPException(status_code=500, detail=f"OCR failed: {e}")
1119
 
1120
  text = (best_text or "").strip()
1121
- =======
1122
- try:
1123
- text = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6")
1124
- except pytesseract.TesseractNotFoundError:
1125
- raise HTTPException(status_code=500, detail="Tesseract OCR not found. Install it / fix path.")
1126
- except Exception as e:
1127
- raise HTTPException(status_code=500, detail=f"OCR failed: {e}")
1128
-
1129
- text = (text or "").strip()
1130
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1131
  text = re.sub(r"[ \t]+", " ", text)
1132
  return text
1133
 
@@ -1179,7 +1016,6 @@ def extract_text_from_pdf(pdf_bytes: bytes, filename: str = "unknown.pdf") -> Di
1179
  return {"text": extracted, "used_ocr": False, "needs_ocr": True}
1180
  try:
1181
  used_ocr = True
1182
- <<<<<<< HEAD
1183
  # Higher DPI for better handwritten OCR
1184
  pages = convert_from_bytes(pdf_bytes, dpi=300)
1185
  page_texts = []
@@ -1205,23 +1041,12 @@ def extract_text_from_pdf(pdf_bytes: bytes, filename: str = "unknown.pdf") -> Di
1205
  if img:
1206
  img = _preprocess_for_ocr(img)
1207
  extracted = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6") or ""
1208
- =======
1209
- pages = convert_from_bytes(pdf_bytes, dpi=250)
1210
- page_texts = []
1211
- for img in pages:
1212
- img = _preprocess_for_ocr(img)
1213
- t = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6") or ""
1214
- if t.strip():
1215
- page_texts.append(t)
1216
- extracted = _clean_extracted_text("\n\n".join(page_texts))
1217
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1218
  except Exception as e:
1219
  return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": True, "ocr_error": str(e)}
1220
 
1221
  return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": False}
1222
 
1223
 
1224
- <<<<<<< HEAD
1225
  def get_question_positions_from_pdf(pdf_bytes: bytes) -> Dict[int, List[Dict]]:
1226
  """
1227
  Detect question number positions in a PDF.
@@ -1494,8 +1319,6 @@ def create_annotated_pdf(
1494
  print(f"[ERROR] Failed to create annotated PDF: {e}")
1495
  return original_pdf_bytes
1496
 
1497
- =======
1498
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1499
  async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
1500
  filename = getattr(file, "filename", "") or "upload"
1501
  content_type = (getattr(file, "content_type", "") or "").lower()
@@ -1545,13 +1368,7 @@ async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
1545
 
1546
 
1547
 
1548
- <<<<<<< HEAD
1549
 
1550
- =======
1551
- # =========================================================
1552
- # ✅ ROUTES
1553
- # =========================================================
1554
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1555
  @app.get("/health")
1556
  def health():
1557
  return {"status": "ok"}
@@ -1560,7 +1377,6 @@ def health():
1560
  @app.get("/health/llm")
1561
  def health_llm():
1562
  return {
1563
- <<<<<<< HEAD
1564
  "ok": bool(gemini_client) and bool(GOOGLE_API_KEYS),
1565
  "gemini": {
1566
  "sdk_import_ok": genai is not None,
@@ -1568,12 +1384,6 @@ def health_llm():
1568
  "num_keys_configured": len(GOOGLE_API_KEYS),
1569
  "current_key_index": current_key_index + 1 if GOOGLE_API_KEYS else 0,
1570
  "rate_limited_keys": list(rate_limited_keys),
1571
- =======
1572
- "ok": bool(gemini_client) and bool(GOOGLE_API_KEY),
1573
- "gemini": {
1574
- "sdk_import_ok": genai is not None,
1575
- "configured": bool(GOOGLE_API_KEY),
1576
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1577
  "client_ready": gemini_client is not None,
1578
  "model": GEMINI_MODEL,
1579
  "last_error": GEMINI_LAST_ERROR if GEMINI_LAST_ERROR else None,
@@ -1581,7 +1391,6 @@ def health_llm():
1581
  }
1582
 
1583
 
1584
- <<<<<<< HEAD
1585
  @app.get("/homework/annotated-url/{homework_id}/{student_id}")
1586
  async def get_annotated_pdf_url(
1587
  homework_id: int,
@@ -1994,13 +1803,10 @@ def build_per_question_results(
1994
  return ai_evaluate_per_question(prompt, student_text, student_level)
1995
 
1996
 
1997
- =======
1998
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
1999
  @app.post("/homework/validate")
2000
  async def homework_validate(
2001
  student_id: int = Form(...),
2002
  homework_id: int = Form(...),
2003
- <<<<<<< HEAD
2004
  student_file: UploadFile = File(...),
2005
  ):
2006
  # 0) Fetch ERP record -> get all fields automatically
@@ -2030,32 +1836,118 @@ async def homework_validate(
2030
  if final_question_type not in ("mcq", "narrative", "mixed"):
2031
  final_question_type = infer_question_type_from_prompt(prompt, student_text)
2032
 
2033
- =======
2034
- sub_institute_id: int = Form(...),
2035
- syear: str = Form(...),
2036
- prompt: str = Form(...),
2037
- student_file: UploadFile = File(...),
2038
- ):
2039
- # 0) Fetch ERP record -> get student_level automatically
2040
- erp_row = fetch_student_record(homework_id, student_id)
2041
- student_level = fetch_student_level_from_erp(erp_row)
2042
- policy = level_policy(student_level)
2043
-
2044
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2045
  # 1) Infer question_type from prompt automatically (NO EXTRA FIELD)
2046
  # Try to parse mixed questions first
2047
  parsed_questions = parse_questions_from_prompt(prompt)
2048
  has_mcq = any(q.get('type') == 'mcq' for q in parsed_questions)
2049
  has_narrative = any(q.get('type') == 'narrative' for q in parsed_questions)
2050
 
2051
- <<<<<<< HEAD
2052
- # Check if it's a PDF
2053
- is_pdf_submission = student_info.get("kind") == "pdf"
2054
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2055
  # Initialize annotated PDF filename
2056
  annotated_pdf_filename = None
2057
  annotated_pdf_url = None
2058
-
2059
  # Function to save annotated PDF — returns (filename, public_url)
2060
  def save_annotated_pdf(pdf_bytes, hw_id, stud_id, results, score, stat, lvl, qtype="mcq"):
2061
  if not pdf_bytes or len(pdf_bytes) < 100:
@@ -2066,16 +1958,22 @@ async def homework_validate(
2066
  ts = int(time.time())
2067
  filename = f"marked_{hw_id}_{stud_id}_{ts}.pdf"
2068
  filepath = os.path.join(outputs_dir, filename)
2069
-
 
 
 
 
 
 
2070
  annotated = create_annotated_pdf(
2071
- original_pdf_bytes=pdf_bytes,
2072
  mcq_results=results,
2073
  match_percentage=score,
2074
  status=stat,
2075
  student_level=lvl,
2076
  question_type=qtype
2077
  )
2078
-
2079
  with open(filepath, "wb") as f:
2080
  f.write(annotated)
2081
  return filename, build_pdf_url(filename)
@@ -2086,36 +1984,17 @@ async def homework_validate(
2086
  MIN_WORDS = 3 if final_question_type == "mcq" else 8
2087
  if len(student_text.split()) < MIN_WORDS:
2088
  # Save annotated PDF even for unreadable (with status shown)
2089
- if is_pdf_submission and original_file_bytes:
2090
  # Show circle mark for unreadable
2091
  unreadable_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Unreadable', 'correct_answer': 'N/A'}]
2092
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2093
  original_file_bytes, homework_id, student_id, unreadable_result, 0, "Unreadable", student_level
2094
  )
2095
- =======
2096
- # Determine overall question type for backwards compatibility
2097
- if has_mcq and has_narrative:
2098
- question_type = "mixed"
2099
- elif has_mcq:
2100
- question_type = "mcq"
2101
- elif has_narrative:
2102
- question_type = "narrative"
2103
- else:
2104
- question_type = infer_question_type_from_prompt(prompt)
2105
-
2106
- # 2) Extract student text
2107
- student_info = await extract_text_from_upload(student_file)
2108
- student_text = (student_info.get("text") or "").strip()
2109
-
2110
- MIN_WORDS = 3 if question_type == "mcq" else 8
2111
- if len(student_text.split()) < MIN_WORDS:
2112
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2113
  return {
2114
  "student_id": student_id,
2115
  "homework_id": homework_id,
2116
  "sub_institute_id": sub_institute_id,
2117
  "syear": syear,
2118
- <<<<<<< HEAD
2119
  "question_type": final_question_type,
2120
  "student_level": student_level,
2121
  "status": "Unreadable",
@@ -2126,36 +2005,22 @@ async def homework_validate(
2126
  "llm_used": False,
2127
  "question_marks": make_question_marks([]),
2128
  "annotated_pdf": annotated_pdf_filename,
2129
- =======
2130
- "question_type": question_type,
2131
- "student_level": student_level,
2132
- "status": "Unreadable",
2133
- "match_percentage": 0,
2134
- "ai_generated_remark": None,
2135
- "rule_based_remark": "Answer text could not be read clearly. Please upload a clearer file.",
2136
- "student_extracted_text": student_text,
2137
- "llm_used": False,
2138
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2139
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2140
  }
2141
 
2142
  if student_info.get("needs_ocr") and not student_text:
2143
- <<<<<<< HEAD
2144
  # Save annotated PDF even for unreadable (with status shown)
2145
- if is_pdf_submission and original_file_bytes:
2146
  # Show circle mark for scanned PDF that needs OCR
2147
  ocr_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Needs OCR', 'correct_answer': 'N/A'}]
2148
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2149
  original_file_bytes, homework_id, student_id, ocr_result, 0, "Unreadable", student_level
2150
  )
2151
- =======
2152
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2153
  return {
2154
  "student_id": student_id,
2155
  "homework_id": homework_id,
2156
  "sub_institute_id": sub_institute_id,
2157
  "syear": syear,
2158
- <<<<<<< HEAD
2159
  "question_type": final_question_type,
2160
  "student_level": student_level,
2161
  "status": "Unreadable",
@@ -2171,28 +2036,10 @@ async def homework_validate(
2171
 
2172
 
2173
  if final_question_type == "mixed":
2174
- =======
2175
- "question_type": question_type,
2176
- "student_level": student_level,
2177
- "status": "Unreadable",
2178
- "match_percentage": 0,
2179
- "ai_generated_remark": None,
2180
- "rule_based_remark": "This PDF looks scanned. OCR is required (install pdf2image + poppler) or upload a clearer file.",
2181
- "student_extracted_text": student_text,
2182
- "llm_used": False,
2183
- "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2184
- }
2185
-
2186
- # =========================================================
2187
- # ✅ MIXED QUESTION TYPES CHECK (MCQ + Narrative)
2188
- # =========================================================
2189
- if question_type == "mixed":
2190
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2191
  # Process each question type separately and combine results
2192
  mcq_results = []
2193
  narrative_results = []
2194
 
2195
- <<<<<<< HEAD
2196
  # Extract ALL MCQ answers from student text with question numbers
2197
  student_answers_by_qid = extract_mcq_answers_with_qid(student_text)
2198
 
@@ -2209,21 +2056,11 @@ async def homework_validate(
2209
  if not chosen:
2210
  chosen = extract_mcq_choice(student_text)
2211
 
2212
- =======
2213
- # Extract MCQ answers from student text for each MCQ question
2214
- for q in parsed_questions:
2215
- if q.get('type') == 'mcq':
2216
- # Try to find answer for this specific question in student's text
2217
- # Use the question text to help locate the answer
2218
- q_text = q.get('question', '')
2219
- chosen = extract_mcq_choice(student_text)
2220
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2221
  correct = q.get('correct_answer') or extract_correct_mcq_from_prompt(q.get('question', ''))
2222
 
2223
  if correct and chosen:
2224
  is_correct = (chosen.lower().strip() == correct.lower().strip())
2225
  mcq_results.append({
2226
- <<<<<<< HEAD
2227
  'qid': qid,
2228
  'correct': is_correct,
2229
  'chosen': chosen,
@@ -2238,12 +2075,6 @@ async def homework_validate(
2238
  'chosen': '',
2239
  'correct_answer': correct,
2240
  'unattempted': True
2241
- =======
2242
- 'qid': q.get('qid'),
2243
- 'correct': is_correct,
2244
- 'chosen': chosen,
2245
- 'correct_answer': correct
2246
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2247
  })
2248
 
2249
  # For narrative questions, use AI to generate reference
@@ -2300,7 +2131,6 @@ async def homework_validate(
2300
  except Exception as e:
2301
  narrative_results = {'error': str(e)}
2302
 
2303
- <<<<<<< HEAD
2304
  # Calculate combined score with level-based partial credit for MCQ
2305
  total_mcq = len(mcq_results)
2306
  correct_mcq = sum(1 for r in mcq_results if r.get('correct'))
@@ -2312,12 +2142,6 @@ async def homework_validate(
2312
 
2313
  # Calculate MCQ score based on level (not just binary correct/incorrect)
2314
  mcq_score = (correct_mcq * credit_per_q) / max(1, total_mcq)
2315
- =======
2316
- # Calculate combined score
2317
- total_mcq = len(mcq_results)
2318
- correct_mcq = sum(1 for r in mcq_results if r.get('correct'))
2319
- mcq_score = (correct_mcq / total_mcq * 100) if total_mcq > 0 else 0
2320
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2321
 
2322
  narrative_score = narrative_results.get('match_percentage', 0) if narrative_results else 0
2323
 
@@ -2339,15 +2163,12 @@ async def homework_validate(
2339
  else:
2340
  status = "Needs Review"
2341
 
2342
- <<<<<<< HEAD
2343
  # Save annotated PDF
2344
- if is_pdf_submission and original_file_bytes and mcq_results:
2345
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2346
  original_file_bytes, homework_id, student_id, mcq_results, final_score, status, student_level
2347
  )
2348
 
2349
- =======
2350
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2351
  return {
2352
  "student_id": student_id,
2353
  "homework_id": homework_id,
@@ -2357,18 +2178,12 @@ async def homework_validate(
2357
  "student_level": student_level,
2358
  "status": status,
2359
  "match_percentage": final_score,
2360
- <<<<<<< HEAD
2361
  "submission_remarks": None,
2362
  "rule_based_remark": f"MCQ: {correct_mcq}/{total_mcq} correct. Narrative score: {narrative_score}%. (Level: {student_level}, Credit per Q: {credit_per_q}%)",
2363
- =======
2364
- "ai_generated_remark": None,
2365
- "rule_based_remark": f"MCQ: {correct_mcq}/{total_mcq} correct. Narrative score: {narrative_score}%.",
2366
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2367
  "llm_used": bool(narrative_results and 'error' not in narrative_results),
2368
  "student_extracted_text": student_text,
2369
  "mcq_results": mcq_results,
2370
  "narrative_results": narrative_results,
2371
- <<<<<<< HEAD
2372
  "question_marks": make_question_marks(mcq_results),
2373
  "annotated_pdf": annotated_pdf_filename,
2374
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
@@ -2466,7 +2281,7 @@ async def homework_validate(
2466
  status = "Verified" if match_percentage >= passing_threshold else "Needs Review"
2467
 
2468
  # Save annotated PDF
2469
- if is_pdf_submission and original_file_bytes:
2470
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2471
  original_file_bytes, homework_id, student_id, mcq_results, match_percentage, status, student_level
2472
  )
@@ -2492,7 +2307,7 @@ async def homework_validate(
2492
  else:
2493
  # No correct answers in prompt - return needs review with extracted answers
2494
  # Save annotated PDF with circle mark
2495
- if is_pdf_submission and original_file_bytes:
2496
  no_answer_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'No Answer Key', 'correct_answer': 'N/A'}]
2497
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2498
  original_file_bytes, homework_id, student_id, no_answer_result, 0, "Needs Review", student_level
@@ -2520,19 +2335,11 @@ async def homework_validate(
2520
  pass # Will continue to narrative handling
2521
  elif not correct:
2522
  # Save annotated PDF with circle mark
2523
- if is_pdf_submission and original_file_bytes:
2524
  no_correct_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Not Found', 'correct_answer': 'N/A'}]
2525
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2526
  original_file_bytes, homework_id, student_id, no_correct_result, 0, "Needs Review", student_level
2527
  )
2528
- =======
2529
- "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2530
- }
2531
- correct = extract_correct_mcq_from_prompt(prompt)
2532
- chosen = extract_mcq_choice(student_text)
2533
-
2534
- if not correct:
2535
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2536
  return {
2537
  "student_id": student_id,
2538
  "homework_id": homework_id,
@@ -2542,7 +2349,6 @@ async def homework_validate(
2542
  "student_level": student_level,
2543
  "status": "Needs Review",
2544
  "match_percentage": 0,
2545
- <<<<<<< HEAD
2546
  "submission_remarks": None,
2547
  "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
2548
  "student_extracted_text": student_text,
@@ -2554,22 +2360,11 @@ async def homework_validate(
2554
  }
2555
  elif not chosen:
2556
  # Save annotated PDF with circle mark
2557
- if is_pdf_submission and original_file_bytes:
2558
  no_chosen_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Not Detected', 'correct_answer': correct or 'N/A'}]
2559
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2560
  original_file_bytes, homework_id, student_id, no_chosen_result, 0, "Needs Review", student_level
2561
  )
2562
- =======
2563
- "ai_generated_remark": None,
2564
- "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
2565
- "student_extracted_text": student_text,
2566
- "llm_used": False,
2567
- "debug": {"correct": correct, "chosen": chosen},
2568
- "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2569
- }
2570
-
2571
- if not chosen:
2572
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2573
  return {
2574
  "student_id": student_id,
2575
  "homework_id": homework_id,
@@ -2579,24 +2374,16 @@ async def homework_validate(
2579
  "student_level": student_level,
2580
  "status": "Needs Review",
2581
  "match_percentage": 0,
2582
- <<<<<<< HEAD
2583
  "submission_remarks": None,
2584
  "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
2585
  "student_extracted_text": student_text,
2586
  "llm_used": False,
2587
  "question_marks": make_question_marks([]),
2588
  "annotated_pdf": annotated_pdf_filename,
2589
- =======
2590
- "ai_generated_remark": None,
2591
- "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
2592
- "student_extracted_text": student_text,
2593
- "llm_used": False,
2594
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2595
  "debug": {"correct": correct, "chosen": chosen},
2596
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2597
  }
2598
 
2599
- <<<<<<< HEAD
2600
  # Only process MCQ validation if not redirecting to narrative
2601
  if not redirect_to_narrative:
2602
  is_correct = (chosen == correct)
@@ -2615,7 +2402,7 @@ async def homework_validate(
2615
  # Save annotated PDF
2616
  _qid = extract_qid_from_prompt(prompt, erp_row)
2617
  mcq_results_single = [{'qid': _qid, 'correct': is_correct, 'chosen': chosen, 'correct_answer': correct}]
2618
- if is_pdf_submission and original_file_bytes:
2619
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2620
  original_file_bytes, homework_id, student_id, mcq_results_single, match_percentage, status, student_level
2621
  )
@@ -2642,34 +2429,10 @@ async def homework_validate(
2642
 
2643
  if gemini_client is None:
2644
  # Save annotated PDF
2645
- if is_pdf_submission and original_file_bytes:
2646
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2647
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2648
  )
2649
- =======
2650
- is_correct = (chosen == correct)
2651
- return {
2652
- "student_id": student_id,
2653
- "homework_id": homework_id,
2654
- "sub_institute_id": sub_institute_id,
2655
- "syear": syear,
2656
- "question_type": "mcq",
2657
- "student_level": student_level,
2658
- "status": "Verified" if is_correct else "Needs Review",
2659
- "match_percentage": 100 if is_correct else 0,
2660
- "ai_generated_remark": None,
2661
- "rule_based_remark": "Correct." if is_correct else f"Incorrect. Expected {correct.upper()}, got {chosen.upper()}.",
2662
- "student_extracted_text": student_text,
2663
- "llm_used": False,
2664
- "debug": {"correct": correct, "chosen": chosen},
2665
- "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2666
- }
2667
-
2668
- # =========================================================
2669
- # ✅ NARRATIVE CHECK (Gemini generates reference)
2670
- # =========================================================
2671
- if gemini_client is None:
2672
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2673
  return {
2674
  "student_id": student_id,
2675
  "homework_id": homework_id,
@@ -2679,20 +2442,13 @@ async def homework_validate(
2679
  "student_level": student_level,
2680
  "status": "Needs Review",
2681
  "match_percentage": 0,
2682
- <<<<<<< HEAD
2683
  "submission_remarks": None,
2684
- =======
2685
- "ai_generated_remark": None,
2686
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2687
  "rule_based_remark": "Gemini not configured. Check /health/llm.",
2688
  "llm_used": False,
2689
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
2690
  "student_extracted_text": student_text,
2691
- <<<<<<< HEAD
2692
  "question_marks": make_question_marks([]),
2693
  "annotated_pdf": annotated_pdf_filename,
2694
- =======
2695
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2696
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2697
  }
2698
 
@@ -2713,14 +2469,11 @@ async def homework_validate(
2713
  )
2714
 
2715
  if not response_text:
2716
- <<<<<<< HEAD
2717
  # Save annotated PDF
2718
- if is_pdf_submission and original_file_bytes:
2719
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2720
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2721
  )
2722
- =======
2723
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2724
  return {
2725
  "student_id": student_id,
2726
  "homework_id": homework_id,
@@ -2730,20 +2483,13 @@ async def homework_validate(
2730
  "student_level": student_level,
2731
  "status": "Needs Review",
2732
  "match_percentage": 0,
2733
- <<<<<<< HEAD
2734
  "submission_remarks": None,
2735
- =======
2736
- "ai_generated_remark": None,
2737
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2738
  "rule_based_remark": "Gemini failed. Check /health/llm.",
2739
  "llm_used": False,
2740
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
2741
  "student_extracted_text": student_text,
2742
- <<<<<<< HEAD
2743
  "question_marks": make_question_marks([]),
2744
  "annotated_pdf": annotated_pdf_filename,
2745
- =======
2746
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2747
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2748
  }
2749
 
@@ -2751,14 +2497,11 @@ async def homework_validate(
2751
  m = re.search(r"\{.*\}", response_text, flags=re.S)
2752
  payload = json.loads(m.group(0) if m else response_text)
2753
  except Exception as e:
2754
- <<<<<<< HEAD
2755
  # Save annotated PDF
2756
- if is_pdf_submission and original_file_bytes:
2757
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2758
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2759
  )
2760
- =======
2761
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2762
  return {
2763
  "student_id": student_id,
2764
  "homework_id": homework_id,
@@ -2768,20 +2511,13 @@ async def homework_validate(
2768
  "student_level": student_level,
2769
  "status": "Needs Review",
2770
  "match_percentage": 0,
2771
- <<<<<<< HEAD
2772
  "submission_remarks": None,
2773
- =======
2774
- "ai_generated_remark": None,
2775
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2776
  "rule_based_remark": "Gemini returned non-JSON output.",
2777
  "llm_used": False,
2778
  "llm_error": {"ok": False, "error_type": "GEMINI_BAD_JSON", "message": str(e), "raw": response_text[:800]},
2779
  "student_extracted_text": student_text,
2780
- <<<<<<< HEAD
2781
  "question_marks": make_question_marks([]),
2782
  "annotated_pdf": annotated_pdf_filename,
2783
- =======
2784
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2785
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2786
  }
2787
 
@@ -2792,14 +2528,11 @@ async def homework_validate(
2792
  key_points = [str(x).strip() for x in key_points if str(x).strip()]
2793
 
2794
  if not ai_reference_answer:
2795
- <<<<<<< HEAD
2796
  # Save annotated PDF
2797
- if is_pdf_submission and original_file_bytes:
2798
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2799
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2800
  )
2801
- =======
2802
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2803
  return {
2804
  "student_id": student_id,
2805
  "homework_id": homework_id,
@@ -2809,19 +2542,12 @@ async def homework_validate(
2809
  "student_level": student_level,
2810
  "status": "Needs Review",
2811
  "match_percentage": 0,
2812
- <<<<<<< HEAD
2813
  "submission_remarks": None,
2814
  "rule_based_remark": "AI returned empty reference answer.",
2815
  "llm_used": True,
2816
  "student_extracted_text": student_text,
2817
  "question_marks": make_question_marks([]),
2818
  "annotated_pdf": annotated_pdf_filename,
2819
- =======
2820
- "ai_generated_remark": None,
2821
- "rule_based_remark": "AI returned empty reference answer.",
2822
- "llm_used": True,
2823
- "student_extracted_text": student_text,
2824
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2825
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2826
  }
2827
 
@@ -2852,11 +2578,7 @@ async def homework_validate(
2852
  f"{remark_prompt}"
2853
  )
2854
 
2855
- <<<<<<< HEAD
2856
  submission_remark = generate_gemini_response(
2857
- =======
2858
- ai_generated_remark = generate_gemini_response(
2859
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2860
  prompt=resp2_prompt,
2861
  system_prompt="You are a strict, helpful teacher. Be concise and factual.",
2862
  max_tokens=140,
@@ -2864,17 +2586,10 @@ async def homework_validate(
2864
  )
2865
 
2866
  rule_based_remark = None
2867
- <<<<<<< HEAD
2868
  remark_llm_used = bool(submission_remark)
2869
  remark_llm_error = None if submission_remark else (GEMINI_LAST_ERROR or "Unknown LLM error")
2870
 
2871
  if not submission_remark:
2872
- =======
2873
- remark_llm_used = bool(ai_generated_remark)
2874
- remark_llm_error = None if ai_generated_remark else (GEMINI_LAST_ERROR or "Unknown LLM error")
2875
-
2876
- if not ai_generated_remark:
2877
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2878
  if status == "Verified":
2879
  rule_based_remark = "Homework matches the expected answer well. Good coverage of the key ideas."
2880
  elif status == "Partial":
@@ -2882,7 +2597,6 @@ async def homework_validate(
2882
  else:
2883
  rule_based_remark = "Homework does not match the expected answer enough. Please review the topic and resubmit with clearer, complete points."
2884
 
2885
- <<<<<<< HEAD
2886
  # Save annotated PDF — evaluate EACH question individually against student text
2887
  per_question_results = build_per_question_results(
2888
  prompt, student_text, status, match_pct,
@@ -2891,13 +2605,11 @@ async def homework_validate(
2891
  policy=policy,
2892
  student_level=student_level,
2893
  )
2894
- if is_pdf_submission and original_file_bytes:
2895
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2896
  original_file_bytes, homework_id, student_id, per_question_results, match_pct, status, student_level, "narrative"
2897
  )
2898
 
2899
- =======
2900
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2901
  return {
2902
  "student_id": student_id,
2903
  "homework_id": homework_id,
@@ -2907,11 +2619,7 @@ async def homework_validate(
2907
  "student_level": student_level,
2908
  "status": status,
2909
  "match_percentage": match_pct,
2910
- <<<<<<< HEAD
2911
  "submission_remarks": submission_remark if submission_remark else None,
2912
- =======
2913
- "ai_generated_remark": ai_generated_remark if ai_generated_remark else None,
2914
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2915
  "rule_based_remark": rule_based_remark,
2916
  "llm_used": True,
2917
  "remark_llm_used": remark_llm_used,
@@ -2921,21 +2629,15 @@ async def homework_validate(
2921
  "key_points": key_points,
2922
  "key_points_covered": covered,
2923
  "key_points_missing": missing,
2924
- <<<<<<< HEAD
2925
  "question_marks": make_question_marks(per_question_results),
2926
  "annotated_pdf": annotated_pdf_filename,
2927
- =======
2928
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2929
  "debug": {
2930
  "similarity": sim,
2931
  "coverage": coverage,
2932
  "policy": policy,
2933
- <<<<<<< HEAD
2934
  "per_question_results": per_question_results,
2935
  "erp_row_fields": list(erp_row.keys()) if erp_row else [],
2936
  "erp_student_level_raw": erp_row.get("student_level") or erp_row.get("level") or erp_row.get("difficulty") or erp_row.get("difficulty_level"),
2937
- =======
2938
- >>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
2939
  },
2940
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2941
  }
 
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from PIL import Image, ImageOps, ImageFilter
12
  import pytesseract
 
13
  import os
14
 
15
  # Serve static files from outputs directory
16
  from fastapi.staticfiles import StaticFiles
17
  from fastapi.responses import FileResponse
 
 
 
18
  from dotenv import load_dotenv
19
  load_dotenv()
20
 
 
30
  PdfReader = None
31
 
32
  try:
 
33
  from reportlab.pdfgen import canvas
34
  from reportlab.lib.pagesizes import letter
35
  from reportlab.lib import colors
 
40
  print(f"[WARN] reportlab import failed: {e}")
41
 
42
  try:
 
 
43
  from pdf2image import convert_from_bytes # requires poppler
44
  except Exception:
45
  convert_from_bytes = None
 
55
  genai = None
56
  print(f"[WARN] google-genai import failed: {e}")
57
 
 
58
  # ✅ Google Cloud Vision SDK (for better handwritten OCR)
59
  try:
60
  from google.cloud import vision
 
111
  "num_keys": len(GOOGLE_API_KEYS),
112
  "has_openai_key": bool(os.getenv("OPENAI_API_KEY")),
113
  }
 
 
 
 
 
 
 
114
  app.add_middleware(
115
  CORSMiddleware,
116
  allow_origins=["*"],
 
119
  allow_headers=["*"],
120
  )
121
 
 
122
 
123
 
 
 
 
 
 
124
  if os.name == "nt":
125
  pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
126
  else:
127
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
128
 
129
 
 
130
 
 
 
 
 
 
131
  ERP_BASE = os.getenv("ERP_BASE", "https://erp.triz.co.in/lms_data")
132
  STORAGE_BASE = os.getenv("STORAGE_BASE", "https://erp.triz.co.in/storage/student/")
133
  ERP_TOKEN = os.getenv("ERP_TOKEN", "")
134
 
135
 
 
136
  def get_public_base_url() -> str:
137
  """
138
  Returns the public base URL of this server.
 
251
  return
252
 
253
  api_key = GOOGLE_API_KEYS[key_index]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  if not genai:
256
  GEMINI_LAST_ERROR = "google-genai not installed / import failed"
257
  gemini_client = None
258
  return
259
 
 
260
  if not api_key:
261
  GEMINI_LAST_ERROR = f"GOOGLE_API_KEY_{key_index + 1} not set"
 
 
 
 
262
  gemini_client = None
263
  return
264
 
265
  try:
 
266
  gemini_client = genai.Client(api_key=api_key)
267
  GEMINI_LAST_ERROR = ""
268
  print(f"[INFO] Gemini client initialized with key #{key_index + 1}")
 
 
 
 
 
269
  except Exception as e:
270
  gemini_client = None
271
  GEMINI_LAST_ERROR = str(e)
272
  print(f"[WARN] Gemini init failed: {GEMINI_LAST_ERROR}")
273
 
274
 
 
275
  def _is_rate_limit_error(error_msg: str) -> bool:
276
  """Check if the error is a rate limit error (429) or service unavailable (503)."""
277
  if not error_msg:
 
314
 
315
 
316
  _init_gemini_client(0)
 
 
 
317
 
318
 
319
  def parse_gemini_error(error_msg: str) -> dict:
 
329
  return {"ok": False, "error_type": "GEMINI_ERROR", "message": msg}
330
 
331
 
 
332
 
333
  def extract_qid_from_prompt(prompt: str, erp_row: dict = None) -> str:
334
  """
 
375
  return "Q1"
376
 
377
 
 
 
378
  def generate_gemini_response(
379
  prompt: str,
380
  system_prompt: str = "",
381
  max_tokens: int = 650,
382
  temperature: float = 0.3,
383
  ) -> str:
 
384
  global GEMINI_LAST_ERROR, gemini_client, rate_limited_keys
 
 
 
385
 
386
  if gemini_client is None:
387
  if not GEMINI_LAST_ERROR:
388
  GEMINI_LAST_ERROR = "Gemini client not initialized"
 
389
  # Try to reinitialize if we have keys available
390
  if GOOGLE_API_KEYS and current_key_index not in rate_limited_keys:
391
  _init_gemini_client(current_key_index)
392
  if gemini_client is None:
393
  return ""
 
 
 
394
 
395
  try:
396
  contents = []
 
408
  GEMINI_LAST_ERROR = ""
409
  return text
410
  except Exception as e:
 
411
  error_msg = str(e)
412
  print(f"[ERROR] Gemini call failed: {error_msg}")
413
 
 
419
  return generate_gemini_response(prompt, system_prompt, max_tokens, temperature)
420
 
421
  GEMINI_LAST_ERROR = error_msg
 
 
 
 
422
  return ""
423
 
424
  import time
 
452
  return int(round(min(0.6, overlap) * 100)) # cap at 60
453
 
454
 
 
455
 
 
 
 
 
 
456
  def _norm(s: str) -> str:
457
  return re.sub(r"\s+", " ", (s or "").strip().lower())
458
 
 
487
  return {"w_sim": 0.6, "w_cov": 0.4, "verified": 75, "partial": 55, "kp_thr": 0.20}
488
 
489
 
 
490
  def mcq_partial_credit(student_level: str) -> dict:
491
  """
492
  Returns partial credit percentage for MCQ questions based on student level.
 
507
  return {"credit_per_question": 75, "passing_threshold": 75}
508
 
509
 
 
 
510
  def keypoint_coverage(student_text: str, key_points: List[str], kp_threshold: float) -> Tuple[List[str], List[str], float]:
511
  covered, missing = [], []
512
  for kp in key_points:
 
524
  return covered, missing, coverage
525
 
526
 
 
527
 
528
  def infer_question_type_from_prompt(prompt: str, student_text: str = "") -> str:
 
 
 
 
 
 
529
  p = _norm(prompt)
530
 
531
  # Explicit markers - check for (mcq) first since it's common in parentheses
 
534
  if re.search(r"\btype\s*:\s*narrative\b", p) or re.search(r"\bquestion_type\s*:\s*narrative\b", p):
535
  return "narrative"
536
 
 
537
  # Heuristic: options A/B/C/D exist in prompt -> likely MCQ
538
  if re.search(r"\b(a|b|c|d)\s*[\)\.]\s+", p) or "option a" in p or "option b" in p:
539
  return "mcq"
 
551
  # If answer starts with A. or B. etc.
552
  if re.search(r"^[a-d]\.\s+", s.strip()):
553
  return "mcq"
 
 
 
 
 
554
 
555
  return "narrative"
556
 
 
615
 
616
  # Check for correct answer (for MCQ)
617
  if current_type == 'mcq':
 
618
  # First check: is this line "Correct Answer(s):" with nothing after it?
619
  # If so, we need to look for the answer on the next line
620
  if re.search(r'^correct\s*answer\s*\(?s\)?\s*[:\.]?\s*$', line, re.IGNORECASE):
 
647
  else:
648
  # Try to extract first letter
649
  current_correct = correct_text[0].upper() if correct_text else None
 
 
 
 
 
 
650
 
651
  # Don't forget the last question
652
  if current_q is not None:
 
660
  # If no questions parsed, fall back to old behavior
661
  if not questions:
662
  qtype = infer_question_type_from_prompt(prompt)
 
663
  return [{'qid': extract_qid_from_prompt(prompt), 'type': qtype, 'question': prompt, 'correct_answer': None}]
 
 
 
664
 
665
  return questions
666
 
 
692
  return ""
693
 
694
 
 
695
  def extract_mcq_answers_with_qid(text: str) -> Dict[str, str]:
696
  """
697
  Extract MCQ answers WITH question numbers from student text.
 
747
  return results
748
 
749
 
 
 
750
  def extract_correct_mcq_from_prompt(prompt: str) -> str:
751
  """
752
  This is IMPORTANT:
 
754
  - Correct: B
755
  - Answer: C
756
  - correct_option: D
 
757
  - Correct Answer(s): A. Devdatta
758
  or JSON: {"correct_option":"B"}
759
 
 
762
  - "Correct Answer(s): A. Devdatta"
763
  - "Correct: B"
764
  - "Answer: C"
 
 
 
765
  """
766
  p = (prompt or "").strip()
767
  if not p:
 
778
  except Exception:
779
  pass
780
 
 
781
  # Text prompt support - new format: "Correct Answer(s): A. Devdatta" or "Correct Answer: B"
782
  t = _norm(p)
783
 
 
800
  return m1c.group(1)
801
 
802
  # Pattern 2: "Correct: A" or "Answer: B" (original pattern)
 
 
 
 
803
  m = re.search(r"\b(correct|answer|ans)\s*[:\-]?\s*\(?\s*([a-d])\s*\)?\b", t)
804
  if m:
805
  return m.group(2)
 
807
  return ""
808
 
809
 
 
810
 
 
 
 
 
 
811
  def _erp_get(params: dict) -> list:
812
  headers = {}
813
  if ERP_TOKEN:
 
839
  return "Medium"
840
 
841
 
 
842
 
843
  def _preprocess_for_ocr(img: Image.Image) -> Image.Image:
844
  """
 
907
  return ""
908
 
909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
910
  def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> str:
911
  if not image_bytes or len(image_bytes) < 50:
912
  raise HTTPException(status_code=400, detail=f"Invalid file: '{filename}' - empty/too small")
 
923
  head = image_bytes[:12]
924
  raise HTTPException(status_code=400, detail=f"Invalid image format: '{filename}' (header={head})")
925
 
 
926
  # First try Google Cloud Vision (better for handwriting)
927
  if vision_client:
928
  gv_text = _extract_text_google_vision(image_bytes)
 
930
  return _clean_extracted_text(gv_text)
931
 
932
  # Fallback to Tesseract with improved preprocessing
 
 
933
  try:
934
  img = Image.open(io.BytesIO(image_bytes))
935
  except Exception as e:
 
937
 
938
  img = _preprocess_for_ocr(img)
939
 
 
940
  # Try multiple OCR configurations for better handwritten recognition
941
  ocr_configs = [
942
  "--oem 3 --psm 6", # Default
 
965
  raise HTTPException(status_code=500, detail=f"OCR failed: {e}")
966
 
967
  text = (best_text or "").strip()
 
 
 
 
 
 
 
 
 
 
968
  text = re.sub(r"[ \t]+", " ", text)
969
  return text
970
 
 
1016
  return {"text": extracted, "used_ocr": False, "needs_ocr": True}
1017
  try:
1018
  used_ocr = True
 
1019
  # Higher DPI for better handwritten OCR
1020
  pages = convert_from_bytes(pdf_bytes, dpi=300)
1021
  page_texts = []
 
1041
  if img:
1042
  img = _preprocess_for_ocr(img)
1043
  extracted = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6") or ""
 
 
 
 
 
 
 
 
 
 
1044
  except Exception as e:
1045
  return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": True, "ocr_error": str(e)}
1046
 
1047
  return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": False}
1048
 
1049
 
 
1050
  def get_question_positions_from_pdf(pdf_bytes: bytes) -> Dict[int, List[Dict]]:
1051
  """
1052
  Detect question number positions in a PDF.
 
1319
  print(f"[ERROR] Failed to create annotated PDF: {e}")
1320
  return original_pdf_bytes
1321
 
 
 
1322
  async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
1323
  filename = getattr(file, "filename", "") or "upload"
1324
  content_type = (getattr(file, "content_type", "") or "").lower()
 
1368
 
1369
 
1370
 
 
1371
 
 
 
 
 
 
1372
  @app.get("/health")
1373
  def health():
1374
  return {"status": "ok"}
 
1377
  @app.get("/health/llm")
1378
  def health_llm():
1379
  return {
 
1380
  "ok": bool(gemini_client) and bool(GOOGLE_API_KEYS),
1381
  "gemini": {
1382
  "sdk_import_ok": genai is not None,
 
1384
  "num_keys_configured": len(GOOGLE_API_KEYS),
1385
  "current_key_index": current_key_index + 1 if GOOGLE_API_KEYS else 0,
1386
  "rate_limited_keys": list(rate_limited_keys),
 
 
 
 
 
 
1387
  "client_ready": gemini_client is not None,
1388
  "model": GEMINI_MODEL,
1389
  "last_error": GEMINI_LAST_ERROR if GEMINI_LAST_ERROR else None,
 
1391
  }
1392
 
1393
 
 
1394
  @app.get("/homework/annotated-url/{homework_id}/{student_id}")
1395
  async def get_annotated_pdf_url(
1396
  homework_id: int,
 
1803
  return ai_evaluate_per_question(prompt, student_text, student_level)
1804
 
1805
 
 
 
1806
  @app.post("/homework/validate")
1807
  async def homework_validate(
1808
  student_id: int = Form(...),
1809
  homework_id: int = Form(...),
 
1810
  student_file: UploadFile = File(...),
1811
  ):
1812
  # 0) Fetch ERP record -> get all fields automatically
 
1836
  if final_question_type not in ("mcq", "narrative", "mixed"):
1837
  final_question_type = infer_question_type_from_prompt(prompt, student_text)
1838
 
 
 
 
 
 
 
 
 
 
 
 
 
1839
  # 1) Infer question_type from prompt automatically (NO EXTRA FIELD)
1840
  # Try to parse mixed questions first
1841
  parsed_questions = parse_questions_from_prompt(prompt)
1842
  has_mcq = any(q.get('type') == 'mcq' for q in parsed_questions)
1843
  has_narrative = any(q.get('type') == 'narrative' for q in parsed_questions)
1844
 
1845
+ # Detect submission kind
1846
+ submission_kind = student_info.get("kind", "") # "pdf", "image", "docx", etc.
1847
+ is_pdf_submission = submission_kind == "pdf"
1848
+ is_image_submission = submission_kind == "image" or submission_kind == "unknown_as_image"
1849
+ is_docx_submission = submission_kind == "docx"
1850
+ can_annotate = is_pdf_submission or is_image_submission or is_docx_submission
1851
+
1852
+ # ── Converters: image/docx → PDF bytes so create_annotated_pdf can process them ──
1853
def _image_bytes_to_pdf(img_bytes: bytes) -> bytes:
    """Wrap a raw image inside a single-page PDF using reportlab.

    The page is sized to the image's pixel dimensions (one pixel per PDF
    point) and the image is drawn so that it fills the whole page.

    Args:
        img_bytes: Encoded image data in any format Pillow can open.

    Returns:
        The PDF document as bytes, or ``b""`` when the conversion fails for
        any reason (missing dependency, undecodable image, ...). Failures are
        logged with a ``[WARN]`` line and never raised.
    """
    try:
        from reportlab.pdfgen import canvas as rl_canvas
        from reportlab.lib.utils import ImageReader
        from PIL import Image as PILImage
        from PIL import ImageOps as PILImageOps
        import io as _io

        # Photos taken on phones commonly keep the pixels unrotated and store
        # the rotation in the EXIF Orientation tag; apply it so the page is
        # not rendered sideways. exif_transpose returns a copy either way.
        img = PILImageOps.exif_transpose(PILImage.open(_io.BytesIO(img_bytes)))
        iw, ih = img.size

        buf = _io.BytesIO()
        c = rl_canvas.Canvas(buf, pagesize=(iw, ih))
        # mask="auto" makes reportlab honour the image's own transparency;
        # without it the alpha channel of RGBA/palette images is ignored.
        c.drawImage(ImageReader(img), 0, 0, iw, ih, mask="auto")
        c.save()
        return buf.getvalue()
    except Exception as e:
        # Best effort: annotation is optional, so report and signal failure
        # with an empty result instead of propagating.
        print(f"[WARN] _image_bytes_to_pdf failed: {e}")
        return b""
1871
+
1872
def _docx_bytes_to_pdf(docx_bytes: bytes) -> bytes:
    """Convert DOCX bytes to PDF bytes.

    Tries LibreOffice (``soffice --headless --convert-to pdf``) first. When
    that is unavailable, fails, or produces nothing, falls back to a plain
    text rendering: the non-empty paragraphs are read with python-docx and
    drawn onto A4 pages with reportlab (Helvetica 11pt, 90-column wrap).

    Args:
        docx_bytes: Raw content of the .docx file.

    Returns:
        The PDF document as bytes, or ``b""`` if both strategies fail.
        Failures are logged with ``[WARN]`` lines and never raised.
    """
    import io as _io
    import os as _os
    import subprocess
    import tempfile
    import textwrap

    # NOTE(review): subprocess.run blocks for up to 30 s and this helper is
    # defined inside an async request handler — confirm that is acceptable.
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            docx_path = _os.path.join(tmpdir, "input.docx")
            with open(docx_path, "wb") as f:
                f.write(docx_bytes)
            result = subprocess.run(
                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", tmpdir, docx_path],
                timeout=30, capture_output=True,
            )
            # soffice names the output after the input's base name; build the
            # path explicitly instead of string-replacing inside the full path.
            pdf_path = _os.path.join(tmpdir, "input.pdf")
            if _os.path.exists(pdf_path):
                with open(pdf_path, "rb") as f:
                    pdf = f.read()
                if pdf:
                    return pdf
            stderr = (result.stderr or b"").decode("utf-8", errors="replace").strip()
            print(f"[WARN] LibreOffice docx→pdf produced no output (exit={result.returncode}): {stderr[:300]}")
    except Exception as e:
        print(f"[WARN] LibreOffice docx→pdf failed: {e}")

    # Fallback: extract text and build a simple PDF with reportlab
    try:
        from reportlab.pdfgen import canvas as rl_canvas
        from reportlab.lib.pagesizes import A4
        from docx import Document as DocxDoc

        doc = DocxDoc(_io.BytesIO(docx_bytes))
        buf = _io.BytesIO()
        _page_w, page_h = A4
        c = rl_canvas.Canvas(buf, pagesize=A4)
        c.setFont("Helvetica", 11)
        # NOTE(review): Helvetica is a built-in PDF font; text outside its
        # encoding (e.g. non-Latin scripts) may not render — confirm.
        y = page_h - 50
        for para in doc.paragraphs:
            if not para.text.strip():
                continue
            # Wrap on word boundaries (over-long words are still split) so
            # words are not cut in half at the 90-character limit.
            for piece in textwrap.wrap(para.text, width=90):
                # Break the page before drawing, so the document never ends
                # with an empty trailing page.
                if y < 50:
                    c.showPage()
                    c.setFont("Helvetica", 11)
                    y = page_h - 50
                c.drawString(40, y, piece)
                y -= 16
        c.save()
        return buf.getvalue()
    except Exception as e:
        print(f"[WARN] Fallback docx→pdf failed: {e}")
        return b""
1930
+
1931
def _get_pdf_bytes_for_annotation() -> bytes:
    """Return the submission as PDF bytes suitable for annotation.

    A PDF submission is passed through untouched. Image and DOCX submissions
    are converted with ``_image_bytes_to_pdf`` / ``_docx_bytes_to_pdf``.
    Returns ``b""`` when the submission kind is not supported or the
    conversion yields nothing.
    """
    if is_pdf_submission:
        return original_file_bytes

    converted = b""
    if is_image_submission:
        converted = _image_bytes_to_pdf(original_file_bytes)
    # Only reached for DOCX when the image path produced nothing.
    if not converted and is_docx_submission:
        converted = _docx_bytes_to_pdf(original_file_bytes)
    return converted or b""
1946
+
1947
  # Initialize annotated PDF filename
1948
  annotated_pdf_filename = None
1949
  annotated_pdf_url = None
1950
+
1951
  # Function to save annotated PDF — returns (filename, public_url)
1952
  def save_annotated_pdf(pdf_bytes, hw_id, stud_id, results, score, stat, lvl, qtype="mcq"):
1953
  if not pdf_bytes or len(pdf_bytes) < 100:
 
1958
  ts = int(time.time())
1959
  filename = f"marked_{hw_id}_{stud_id}_{ts}.pdf"
1960
  filepath = os.path.join(outputs_dir, filename)
1961
+
1962
+ # Convert image/docx → PDF if needed, then annotate
1963
+ annotation_input = _get_pdf_bytes_for_annotation()
1964
+ if not annotation_input:
1965
+ print(f"[WARN] Could not get PDF bytes for annotation (kind={submission_kind})")
1966
+ return None, None
1967
+
1968
  annotated = create_annotated_pdf(
1969
+ original_pdf_bytes=annotation_input,
1970
  mcq_results=results,
1971
  match_percentage=score,
1972
  status=stat,
1973
  student_level=lvl,
1974
  question_type=qtype
1975
  )
1976
+
1977
  with open(filepath, "wb") as f:
1978
  f.write(annotated)
1979
  return filename, build_pdf_url(filename)
 
1984
  MIN_WORDS = 3 if final_question_type == "mcq" else 8
1985
  if len(student_text.split()) < MIN_WORDS:
1986
  # Save annotated PDF even for unreadable (with status shown)
1987
+ if can_annotate and original_file_bytes:
1988
  # Show circle mark for unreadable
1989
  unreadable_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Unreadable', 'correct_answer': 'N/A'}]
1990
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
1991
  original_file_bytes, homework_id, student_id, unreadable_result, 0, "Unreadable", student_level
1992
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1993
  return {
1994
  "student_id": student_id,
1995
  "homework_id": homework_id,
1996
  "sub_institute_id": sub_institute_id,
1997
  "syear": syear,
 
1998
  "question_type": final_question_type,
1999
  "student_level": student_level,
2000
  "status": "Unreadable",
 
2005
  "llm_used": False,
2006
  "question_marks": make_question_marks([]),
2007
  "annotated_pdf": annotated_pdf_filename,
 
 
 
 
 
 
 
 
 
 
2008
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2009
  }
2010
 
2011
  if student_info.get("needs_ocr") and not student_text:
 
2012
  # Save annotated PDF even for unreadable (with status shown)
2013
+ if can_annotate and original_file_bytes:
2014
  # Show circle mark for scanned PDF that needs OCR
2015
  ocr_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Needs OCR', 'correct_answer': 'N/A'}]
2016
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2017
  original_file_bytes, homework_id, student_id, ocr_result, 0, "Unreadable", student_level
2018
  )
 
 
2019
  return {
2020
  "student_id": student_id,
2021
  "homework_id": homework_id,
2022
  "sub_institute_id": sub_institute_id,
2023
  "syear": syear,
 
2024
  "question_type": final_question_type,
2025
  "student_level": student_level,
2026
  "status": "Unreadable",
 
2036
 
2037
 
2038
  if final_question_type == "mixed":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2039
  # Process each question type separately and combine results
2040
  mcq_results = []
2041
  narrative_results = []
2042
 
 
2043
  # Extract ALL MCQ answers from student text with question numbers
2044
  student_answers_by_qid = extract_mcq_answers_with_qid(student_text)
2045
 
 
2056
  if not chosen:
2057
  chosen = extract_mcq_choice(student_text)
2058
 
 
 
 
 
 
 
 
 
 
2059
  correct = q.get('correct_answer') or extract_correct_mcq_from_prompt(q.get('question', ''))
2060
 
2061
  if correct and chosen:
2062
  is_correct = (chosen.lower().strip() == correct.lower().strip())
2063
  mcq_results.append({
 
2064
  'qid': qid,
2065
  'correct': is_correct,
2066
  'chosen': chosen,
 
2075
  'chosen': '',
2076
  'correct_answer': correct,
2077
  'unattempted': True
 
 
 
 
 
 
2078
  })
2079
 
2080
  # For narrative questions, use AI to generate reference
 
2131
  except Exception as e:
2132
  narrative_results = {'error': str(e)}
2133
 
 
2134
  # Calculate combined score with level-based partial credit for MCQ
2135
  total_mcq = len(mcq_results)
2136
  correct_mcq = sum(1 for r in mcq_results if r.get('correct'))
 
2142
 
2143
  # Calculate MCQ score based on level (not just binary correct/incorrect)
2144
  mcq_score = (correct_mcq * credit_per_q) / max(1, total_mcq)
 
 
 
 
 
 
2145
 
2146
  narrative_score = narrative_results.get('match_percentage', 0) if narrative_results else 0
2147
 
 
2163
  else:
2164
  status = "Needs Review"
2165
 
 
2166
  # Save annotated PDF
2167
+ if can_annotate and original_file_bytes and mcq_results:
2168
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2169
  original_file_bytes, homework_id, student_id, mcq_results, final_score, status, student_level
2170
  )
2171
 
 
 
2172
  return {
2173
  "student_id": student_id,
2174
  "homework_id": homework_id,
 
2178
  "student_level": student_level,
2179
  "status": status,
2180
  "match_percentage": final_score,
 
2181
  "submission_remarks": None,
2182
  "rule_based_remark": f"MCQ: {correct_mcq}/{total_mcq} correct. Narrative score: {narrative_score}%. (Level: {student_level}, Credit per Q: {credit_per_q}%)",
 
 
 
 
2183
  "llm_used": bool(narrative_results and 'error' not in narrative_results),
2184
  "student_extracted_text": student_text,
2185
  "mcq_results": mcq_results,
2186
  "narrative_results": narrative_results,
 
2187
  "question_marks": make_question_marks(mcq_results),
2188
  "annotated_pdf": annotated_pdf_filename,
2189
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
 
2281
  status = "Verified" if match_percentage >= passing_threshold else "Needs Review"
2282
 
2283
  # Save annotated PDF
2284
+ if can_annotate and original_file_bytes:
2285
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2286
  original_file_bytes, homework_id, student_id, mcq_results, match_percentage, status, student_level
2287
  )
 
2307
  else:
2308
  # No correct answers in prompt - return needs review with extracted answers
2309
  # Save annotated PDF with circle mark
2310
+ if can_annotate and original_file_bytes:
2311
  no_answer_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'No Answer Key', 'correct_answer': 'N/A'}]
2312
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2313
  original_file_bytes, homework_id, student_id, no_answer_result, 0, "Needs Review", student_level
 
2335
  pass # Will continue to narrative handling
2336
  elif not correct:
2337
  # Save annotated PDF with circle mark
2338
+ if can_annotate and original_file_bytes:
2339
  no_correct_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Not Found', 'correct_answer': 'N/A'}]
2340
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2341
  original_file_bytes, homework_id, student_id, no_correct_result, 0, "Needs Review", student_level
2342
  )
 
 
 
 
 
 
 
 
2343
  return {
2344
  "student_id": student_id,
2345
  "homework_id": homework_id,
 
2349
  "student_level": student_level,
2350
  "status": "Needs Review",
2351
  "match_percentage": 0,
 
2352
  "submission_remarks": None,
2353
  "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
2354
  "student_extracted_text": student_text,
 
2360
  }
2361
  elif not chosen:
2362
  # Save annotated PDF with circle mark
2363
+ if can_annotate and original_file_bytes:
2364
  no_chosen_result = [{'qid': extract_qid_from_prompt(prompt, erp_row), 'correct': None, 'chosen': 'Not Detected', 'correct_answer': correct or 'N/A'}]
2365
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2366
  original_file_bytes, homework_id, student_id, no_chosen_result, 0, "Needs Review", student_level
2367
  )
 
 
 
 
 
 
 
 
 
 
 
2368
  return {
2369
  "student_id": student_id,
2370
  "homework_id": homework_id,
 
2374
  "student_level": student_level,
2375
  "status": "Needs Review",
2376
  "match_percentage": 0,
 
2377
  "submission_remarks": None,
2378
  "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
2379
  "student_extracted_text": student_text,
2380
  "llm_used": False,
2381
  "question_marks": make_question_marks([]),
2382
  "annotated_pdf": annotated_pdf_filename,
 
 
 
 
 
 
2383
  "debug": {"correct": correct, "chosen": chosen},
2384
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2385
  }
2386
 
 
2387
  # Only process MCQ validation if not redirecting to narrative
2388
  if not redirect_to_narrative:
2389
  is_correct = (chosen == correct)
 
2402
  # Save annotated PDF
2403
  _qid = extract_qid_from_prompt(prompt, erp_row)
2404
  mcq_results_single = [{'qid': _qid, 'correct': is_correct, 'chosen': chosen, 'correct_answer': correct}]
2405
+ if can_annotate and original_file_bytes:
2406
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2407
  original_file_bytes, homework_id, student_id, mcq_results_single, match_percentage, status, student_level
2408
  )
 
2429
 
2430
  if gemini_client is None:
2431
  # Save annotated PDF
2432
+ if can_annotate and original_file_bytes:
2433
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2434
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2435
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2436
  return {
2437
  "student_id": student_id,
2438
  "homework_id": homework_id,
 
2442
  "student_level": student_level,
2443
  "status": "Needs Review",
2444
  "match_percentage": 0,
 
2445
  "submission_remarks": None,
 
 
 
2446
  "rule_based_remark": "Gemini not configured. Check /health/llm.",
2447
  "llm_used": False,
2448
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
2449
  "student_extracted_text": student_text,
 
2450
  "question_marks": make_question_marks([]),
2451
  "annotated_pdf": annotated_pdf_filename,
 
 
2452
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2453
  }
2454
 
 
2469
  )
2470
 
2471
  if not response_text:
 
2472
  # Save annotated PDF
2473
+ if can_annotate and original_file_bytes:
2474
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2475
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2476
  )
 
 
2477
  return {
2478
  "student_id": student_id,
2479
  "homework_id": homework_id,
 
2483
  "student_level": student_level,
2484
  "status": "Needs Review",
2485
  "match_percentage": 0,
 
2486
  "submission_remarks": None,
 
 
 
2487
  "rule_based_remark": "Gemini failed. Check /health/llm.",
2488
  "llm_used": False,
2489
  "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
2490
  "student_extracted_text": student_text,
 
2491
  "question_marks": make_question_marks([]),
2492
  "annotated_pdf": annotated_pdf_filename,
 
 
2493
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2494
  }
2495
 
 
2497
  m = re.search(r"\{.*\}", response_text, flags=re.S)
2498
  payload = json.loads(m.group(0) if m else response_text)
2499
  except Exception as e:
 
2500
  # Save annotated PDF
2501
+ if can_annotate and original_file_bytes:
2502
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2503
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2504
  )
 
 
2505
  return {
2506
  "student_id": student_id,
2507
  "homework_id": homework_id,
 
2511
  "student_level": student_level,
2512
  "status": "Needs Review",
2513
  "match_percentage": 0,
 
2514
  "submission_remarks": None,
 
 
 
2515
  "rule_based_remark": "Gemini returned non-JSON output.",
2516
  "llm_used": False,
2517
  "llm_error": {"ok": False, "error_type": "GEMINI_BAD_JSON", "message": str(e), "raw": response_text[:800]},
2518
  "student_extracted_text": student_text,
 
2519
  "question_marks": make_question_marks([]),
2520
  "annotated_pdf": annotated_pdf_filename,
 
 
2521
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2522
  }
2523
 
 
2528
  key_points = [str(x).strip() for x in key_points if str(x).strip()]
2529
 
2530
  if not ai_reference_answer:
 
2531
  # Save annotated PDF
2532
+ if can_annotate and original_file_bytes:
2533
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2534
  original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
2535
  )
 
 
2536
  return {
2537
  "student_id": student_id,
2538
  "homework_id": homework_id,
 
2542
  "student_level": student_level,
2543
  "status": "Needs Review",
2544
  "match_percentage": 0,
 
2545
  "submission_remarks": None,
2546
  "rule_based_remark": "AI returned empty reference answer.",
2547
  "llm_used": True,
2548
  "student_extracted_text": student_text,
2549
  "question_marks": make_question_marks([]),
2550
  "annotated_pdf": annotated_pdf_filename,
 
 
 
 
 
 
2551
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2552
  }
2553
 
 
2578
  f"{remark_prompt}"
2579
  )
2580
 
 
2581
  submission_remark = generate_gemini_response(
 
 
 
2582
  prompt=resp2_prompt,
2583
  system_prompt="You are a strict, helpful teacher. Be concise and factual.",
2584
  max_tokens=140,
 
2586
  )
2587
 
2588
  rule_based_remark = None
 
2589
  remark_llm_used = bool(submission_remark)
2590
  remark_llm_error = None if submission_remark else (GEMINI_LAST_ERROR or "Unknown LLM error")
2591
 
2592
  if not submission_remark:
 
 
 
 
 
 
2593
  if status == "Verified":
2594
  rule_based_remark = "Homework matches the expected answer well. Good coverage of the key ideas."
2595
  elif status == "Partial":
 
2597
  else:
2598
  rule_based_remark = "Homework does not match the expected answer enough. Please review the topic and resubmit with clearer, complete points."
2599
 
 
2600
  # Save annotated PDF — evaluate EACH question individually against student text
2601
  per_question_results = build_per_question_results(
2602
  prompt, student_text, status, match_pct,
 
2605
  policy=policy,
2606
  student_level=student_level,
2607
  )
2608
+ if can_annotate and original_file_bytes:
2609
  annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
2610
  original_file_bytes, homework_id, student_id, per_question_results, match_pct, status, student_level, "narrative"
2611
  )
2612
 
 
 
2613
  return {
2614
  "student_id": student_id,
2615
  "homework_id": homework_id,
 
2619
  "student_level": student_level,
2620
  "status": status,
2621
  "match_percentage": match_pct,
 
2622
  "submission_remarks": submission_remark if submission_remark else None,
 
 
 
2623
  "rule_based_remark": rule_based_remark,
2624
  "llm_used": True,
2625
  "remark_llm_used": remark_llm_used,
 
2629
  "key_points": key_points,
2630
  "key_points_covered": covered,
2631
  "key_points_missing": missing,
 
2632
  "question_marks": make_question_marks(per_question_results),
2633
  "annotated_pdf": annotated_pdf_filename,
 
 
2634
  "debug": {
2635
  "similarity": sim,
2636
  "coverage": coverage,
2637
  "policy": policy,
 
2638
  "per_question_results": per_question_results,
2639
  "erp_row_fields": list(erp_row.keys()) if erp_row else [],
2640
  "erp_student_level_raw": erp_row.get("student_level") or erp_row.get("level") or erp_row.get("difficulty") or erp_row.get("difficulty_level"),
 
 
2641
  },
2642
  "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
2643
  }