Spaces:

Moncey10
/

homework_validation_system

Running

App Files Files Community

Moncey10 committed on Feb 26

Commit

5fe9776

2 Parent(s): 9087714 5ec9328

Merge GitHub main into local

Browse files

Files changed (8) hide show

.gitignore +4 -1
Dockerfile +23 -0
README.md +92 -1
answer_key.json +60 -0
app.py +403 -0
db.py +3 -0
requirements.txt +10 -0
scholar clone.lnk +0 -0

.gitignore CHANGED Viewed

@@ -18,6 +18,7 @@ answer_key.json
 __pycache__/
 *.pyc
 .env
 myenv/
 venv/
 .venv/
@@ -27,4 +28,6 @@ __pycache__/
 *.pyd
 *.db
 homework.db
-_local_backup/

 __pycache__/
 *.pyc
 .env
+<<<<<<< HEAD
 myenv/
 venv/
 .venv/
 *.pyd
 *.db
 homework.db
+_local_backup/
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e

Dockerfile CHANGED Viewed

@@ -1,3 +1,4 @@
 FROM python:3.10-slim
 # Install Tesseract OCR + Poppler + dependencies
@@ -21,3 +22,25 @@ ENV HF_SPACE=moncey10-homework-validation-system.hf.space
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

+<<<<<<< HEAD
 FROM python:3.10-slim
 # Install Tesseract OCR + Poppler + dependencies
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+=======
+FROM python:3.10-slim
+# System packages: Tesseract OCR (pytesseract invokes /usr/bin/tesseract) and
+# Poppler (required by pdf2image for PDF-to-image conversion).
+# --no-install-recommends avoids pulling optional packages into the image;
+# the apt lists are removed in the same layer so they never persist.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libleptonica-dev \
+    libtesseract-dev \
+    poppler-utils \
+    tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Copy the dependency manifest on its own first so the pip layer stays cached
+# until requirements.txt changes.
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY . /app
+# HF Spaces uses 7860
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e

README.md CHANGED Viewed

@@ -1,6 +1,97 @@
 ---
 title: Homework Validation System
 sdk: docker
 app_port: 7860
 ---
-hello

+<<<<<<< HEAD
 ---
 title: Homework Validation System
 sdk: docker
 app_port: 7860
 ---
+hello
+=======
+---
+title: Homework Validation System
+sdk: docker
+app_port: 7860
+---
+# Homework Validation System (FastAPI)
+A backend API that validates student homework by extracting text from teacher and student files, comparing answers, and generating remarks using rule-based logic and optional AI.
+---
+## Features
+- Upload teacher and student homework files
+- OCR support for images and scanned PDFs
+- Text extraction from PDF and DOCX
+- Similarity matching using TF-IDF + cosine similarity
+- Optional AI-generated remarks (OpenAI / Gemini)
+- FastAPI Swagger documentation
+---
+## Tech Stack
+- FastAPI
+- Python
+- pytesseract
+- Pillow
+- pypdf / pdf2image
+- python-docx
+- scikit-learn
+- OpenAI / Gemini (optional)
+---
+## Project Structure
+```
+homework_validation_system/
+│
+├── app.py
+├── requirements.txt
+├── artifacts/
+├── uploads/
+├── src/
+│ ├── extractors.py
+│ ├── similarity.py
+│ ├── llm_client.py
+│ └── utils.py
+└── README.md
+```
+## Installation
+### 1. Create Virtual Environment
+python -m venv myenv
+### 2. Install Requirements
+pip install -r requirements.txt
+## OCR Setup (Required)
+### Install Tesseract OCR
+This project uses **Tesseract OCR** for extracting text from images and scanned PDFs.
+#### Windows
+1. Download and install Tesseract OCR.
+2. Default installation path: `C:\Program Files\Tesseract-OCR\tesseract.exe`
+3. Add this path in your code:
+pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+### Run API
+uvicorn app:app --reload --host 0.0.0.0 --port 8000
+### Swagger UI:
+http://localhost:8000/docs
+### Example API Response
+{
+  "student_id": 1,
+  "homework_id": 10,
+  "status": "Needs Review",
+  "match_percentage": 72,
+  "teacher_extracted_text": "...",
+  "student_extracted_text": "...",
+  "ai_generated_remark": "Good attempt but missing key points.",
+  "llm_used": true
+}
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e

answer_key.json CHANGED Viewed

@@ -1,3 +1,4 @@
 {
   "hw01": {
     "questions": [
@@ -55,4 +56,63 @@
       }
     ]
   }
 }

+<<<<<<< HEAD
 {
   "hw01": {
     "questions": [
       }
     ]
   }
+=======
+{
+  "hw01": {
+    "questions": [
+      {
+        "qid": "Q1",
+        "type": "text",
+        "answer": "Artificial Intelligence is the simulation of human intelligence."
+      },
+      {
+        "qid": "Q2",
+        "type": "text",
+        "answer": "Machine Learning is a subset of AI that learns from data."
+      }
+    ]
+  },
+  "hw99": {
+    "questions": [
+      {
+        "qid": "Q1",
+        "type": "text",
+        "answer": "Artificial Intelligence is the simulation of human intelligence."
+      },
+      {
+        "qid": "Q2",
+        "type": "text",
+        "answer": "Machine Learning is a subset of AI that learns from data."
+      }
+    ]
+  },
+  "hw90": {
+    "questions": [
+      {
+        "qid": "Q1",
+        "type": "text",
+        "answer": "Artificial Intelligence is the simulation of human intelligence."
+      },
+      {
+        "qid": "Q2",
+        "type": "text",
+        "answer": "Machine Learning is a subset of AI that learns from data."
+      }
+    ]
+  },
+  "hw15": {
+    "questions": [
+      {
+        "qid": "Q1",
+        "type": "text",
+        "answer": "Artificial Intelligence is the simulation of human intelligence."
+      },
+      {
+        "qid": "Q2",
+        "type": "text",
+        "answer": "Machine Learning is a subset of AI that learns from data."
+      }
+    ]
+  }
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 }

app.py CHANGED Viewed

@@ -10,11 +10,15 @@ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from PIL import Image, ImageOps, ImageFilter
 import pytesseract
 import os
 # Serve static files from outputs directory
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse
 from dotenv import load_dotenv
 load_dotenv()
@@ -30,6 +34,7 @@ except Exception:
     PdfReader = None
 try:
     from reportlab.pdfgen import canvas
     from reportlab.lib.pagesizes import letter
     from reportlab.lib import colors
@@ -40,6 +45,8 @@ except Exception as e:
     print(f"[WARN] reportlab import failed: {e}")
 try:
     from pdf2image import convert_from_bytes  # requires poppler
 except Exception:
     convert_from_bytes = None
@@ -55,6 +62,7 @@ except Exception as e:
     genai = None
     print(f"[WARN] google-genai import failed: {e}")
 # ✅ Google Cloud Vision SDK (for better handwritten OCR)
 try:
     from google.cloud import vision
@@ -111,6 +119,13 @@ def debug_env():
         "num_keys": len(GOOGLE_API_KEYS),
         "has_openai_key": bool(os.getenv("OPENAI_API_KEY")),
     }
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -119,20 +134,33 @@ app.add_middleware(
     allow_headers=["*"],
 )
 if os.name == "nt":
     pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
 else:
     pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 ERP_BASE = os.getenv("ERP_BASE", "https://erp.triz.co.in/lms_data")
 STORAGE_BASE = os.getenv("STORAGE_BASE", "https://erp.triz.co.in/storage/student/")
 ERP_TOKEN = os.getenv("ERP_TOKEN", "")
 def get_public_base_url() -> str:
     """
     Returns the public base URL of this server.
@@ -251,27 +279,58 @@ def _init_gemini_client(key_index: int = 0) -> None:
         return
     api_key = GOOGLE_API_KEYS[key_index]
     if not genai:
         GEMINI_LAST_ERROR = "google-genai not installed / import failed"
         gemini_client = None
         return
     if not api_key:
         GEMINI_LAST_ERROR = f"GOOGLE_API_KEY_{key_index + 1} not set"
         gemini_client = None
         return
     try:
         gemini_client = genai.Client(api_key=api_key)
         GEMINI_LAST_ERROR = ""
         print(f"[INFO] Gemini client initialized with key #{key_index + 1}")
     except Exception as e:
         gemini_client = None
         GEMINI_LAST_ERROR = str(e)
         print(f"[WARN] Gemini init failed: {GEMINI_LAST_ERROR}")
 def _is_rate_limit_error(error_msg: str) -> bool:
     """Check if the error is a rate limit error (429) or service unavailable (503)."""
     if not error_msg:
@@ -314,6 +373,9 @@ def _rotate_to_next_key() -> bool:
 _init_gemini_client(0)
 def parse_gemini_error(error_msg: str) -> dict:
@@ -329,6 +391,7 @@ def parse_gemini_error(error_msg: str) -> dict:
     return {"ok": False, "error_type": "GEMINI_ERROR", "message": msg}
 def extract_qid_from_prompt(prompt: str, erp_row: dict = None) -> str:
     """
@@ -375,22 +438,32 @@ def extract_qid_from_prompt(prompt: str, erp_row: dict = None) -> str:
     return "Q1"
 def generate_gemini_response(
     prompt: str,
     system_prompt: str = "",
     max_tokens: int = 650,
     temperature: float = 0.3,
 ) -> str:
     global GEMINI_LAST_ERROR, gemini_client, rate_limited_keys
     if gemini_client is None:
         if not GEMINI_LAST_ERROR:
             GEMINI_LAST_ERROR = "Gemini client not initialized"
         # Try to reinitialize if we have keys available
         if GOOGLE_API_KEYS and current_key_index not in rate_limited_keys:
             _init_gemini_client(current_key_index)
         if gemini_client is None:
             return ""
     try:
         contents = []
@@ -408,6 +481,7 @@ def generate_gemini_response(
             GEMINI_LAST_ERROR = ""
         return text
     except Exception as e:
         error_msg = str(e)
         print(f"[ERROR] Gemini call failed: {error_msg}")
@@ -419,6 +493,10 @@ def generate_gemini_response(
                 return generate_gemini_response(prompt, system_prompt, max_tokens, temperature)
         GEMINI_LAST_ERROR = error_msg
         return ""
 import time
@@ -452,7 +530,13 @@ def cheap_overlap_score(student_text: str, prompt: str) -> int:
     return int(round(min(0.6, overlap) * 100))  # cap at 60
 def _norm(s: str) -> str:
     return re.sub(r"\s+", " ", (s or "").strip().lower())
@@ -487,6 +571,7 @@ def level_policy(student_level: str) -> dict:
     return {"w_sim": 0.6, "w_cov": 0.4, "verified": 75, "partial": 55, "kp_thr": 0.20}
 def mcq_partial_credit(student_level: str) -> dict:
     """
     Returns partial credit percentage for MCQ questions based on student level.
@@ -507,6 +592,8 @@ def mcq_partial_credit(student_level: str) -> dict:
     return {"credit_per_question": 75, "passing_threshold": 75}
 def keypoint_coverage(student_text: str, key_points: List[str], kp_threshold: float) -> Tuple[List[str], List[str], float]:
     covered, missing = [], []
     for kp in key_points:
@@ -524,8 +611,15 @@ def keypoint_coverage(student_text: str, key_points: List[str], kp_threshold: fl
     return covered, missing, coverage
 def infer_question_type_from_prompt(prompt: str, student_text: str = "") -> str:
     p = _norm(prompt)
     # Explicit markers - check for (mcq) first since it's common in parentheses
@@ -534,6 +628,7 @@ def infer_question_type_from_prompt(prompt: str, student_text: str = "") -> str:
     if re.search(r"\btype\s*:\s*narrative\b", p) or re.search(r"\bquestion_type\s*:\s*narrative\b", p):
         return "narrative"
     # Heuristic: options A/B/C/D exist in prompt -> likely MCQ
     if re.search(r"\b(a|b|c|d)\s*[\)\.]\s+", p) or "option a" in p or "option b" in p:
         return "mcq"
@@ -551,6 +646,11 @@ def infer_question_type_from_prompt(prompt: str, student_text: str = "") -> str:
         # If answer starts with A. or B. etc.
         if re.search(r"^[a-d]\.\s+", s.strip()):
             return "mcq"
     return "narrative"
@@ -615,6 +715,7 @@ def parse_questions_from_prompt(prompt: str) -> List[Dict[str, Any]]:
                 # Check for correct answer (for MCQ)
                 if current_type == 'mcq':
                     # First check: is this line "Correct Answer(s):" with nothing after it?
                     # If so, we need to look for the answer on the next line
                     if re.search(r'^correct\s*answer\s*\(?s\)?\s*[:\.]?\s*$', line, re.IGNORECASE):
@@ -647,6 +748,12 @@ def parse_questions_from_prompt(prompt: str) -> List[Dict[str, Any]]:
                         else:
                             # Try to extract first letter
                             current_correct = correct_text[0].upper() if correct_text else None
     # Don't forget the last question
     if current_q is not None:
@@ -660,7 +767,11 @@ def parse_questions_from_prompt(prompt: str) -> List[Dict[str, Any]]:
     # If no questions parsed, fall back to old behavior
     if not questions:
         qtype = infer_question_type_from_prompt(prompt)
         return [{'qid': extract_qid_from_prompt(prompt), 'type': qtype, 'question': prompt, 'correct_answer': None}]
     return questions
@@ -692,6 +803,7 @@ def extract_mcq_choice(text: str) -> str:
     return ""
 def extract_mcq_answers_with_qid(text: str) -> Dict[str, str]:
     """
     Extract MCQ answers WITH question numbers from student text.
@@ -747,6 +859,8 @@ def extract_mcq_answers_with_qid(text: str) -> Dict[str, str]:
     return results
 def extract_correct_mcq_from_prompt(prompt: str) -> str:
     """
     This is IMPORTANT:
@@ -754,6 +868,7 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
       - Correct: B
       - Answer: C
       - correct_option: D
       - Correct Answer(s): A. Devdatta
     or JSON: {"correct_option":"B"}
@@ -762,6 +877,9 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
       - "Correct Answer(s): A. Devdatta"
       - "Correct: B"
       - "Answer: C"
     """
     p = (prompt or "").strip()
     if not p:
@@ -778,6 +896,7 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
         except Exception:
             pass
     # Text prompt support - new format: "Correct Answer(s): A. Devdatta" or "Correct Answer: B"
     t = _norm(p)
@@ -800,6 +919,10 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
         return m1c.group(1)
     # Pattern 2: "Correct: A" or "Answer: B" (original pattern)
     m = re.search(r"\b(correct|answer|ans)\s*[:\-]?\s*\(?\s*([a-d])\s*\)?\b", t)
     if m:
         return m.group(2)
@@ -807,7 +930,13 @@ def extract_correct_mcq_from_prompt(prompt: str) -> str:
     return ""
 def _erp_get(params: dict) -> list:
     headers = {}
     if ERP_TOKEN:
@@ -839,6 +968,7 @@ def fetch_student_level_from_erp(row: Dict[str, Any]) -> str:
     return "Medium"
 def _preprocess_for_ocr(img: Image.Image) -> Image.Image:
     """
@@ -907,6 +1037,25 @@ def _extract_text_google_vision(image_bytes: bytes) -> str:
         return ""
 def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> str:
     if not image_bytes or len(image_bytes) < 50:
         raise HTTPException(status_code=400, detail=f"Invalid file: '{filename}' - empty/too small")
@@ -923,6 +1072,7 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
         head = image_bytes[:12]
         raise HTTPException(status_code=400, detail=f"Invalid image format: '{filename}' (header={head})")
     # First try Google Cloud Vision (better for handwriting)
     if vision_client:
         gv_text = _extract_text_google_vision(image_bytes)
@@ -930,6 +1080,8 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
             return _clean_extracted_text(gv_text)
     # Fallback to Tesseract with improved preprocessing
     try:
         img = Image.open(io.BytesIO(image_bytes))
     except Exception as e:
@@ -937,6 +1089,7 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
     img = _preprocess_for_ocr(img)
     # Try multiple OCR configurations for better handwritten recognition
     ocr_configs = [
         "--oem 3 --psm 6",  # Default
@@ -965,6 +1118,16 @@ def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> st
             raise HTTPException(status_code=500, detail=f"OCR failed: {e}")
     text = (best_text or "").strip()
     text = re.sub(r"[ \t]+", " ", text)
     return text
@@ -1016,6 +1179,7 @@ def extract_text_from_pdf(pdf_bytes: bytes, filename: str = "unknown.pdf") -> Di
             return {"text": extracted, "used_ocr": False, "needs_ocr": True}
         try:
             used_ocr = True
             # Higher DPI for better handwritten OCR
             pages = convert_from_bytes(pdf_bytes, dpi=300)
             page_texts = []
@@ -1041,12 +1205,23 @@ def extract_text_from_pdf(pdf_bytes: bytes, filename: str = "unknown.pdf") -> Di
                 if img:
                     img = _preprocess_for_ocr(img)
                     extracted = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6") or ""
         except Exception as e:
             return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": True, "ocr_error": str(e)}
     return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": False}
 def get_question_positions_from_pdf(pdf_bytes: bytes) -> Dict[int, List[Dict]]:
     """
     Detect question number positions in a PDF.
@@ -1319,6 +1494,8 @@ def create_annotated_pdf(
         print(f"[ERROR] Failed to create annotated PDF: {e}")
         return original_pdf_bytes
 async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
     filename = getattr(file, "filename", "") or "upload"
     content_type = (getattr(file, "content_type", "") or "").lower()
@@ -1368,7 +1545,13 @@ async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
 @app.get("/health")
 def health():
     return {"status": "ok"}
@@ -1377,6 +1560,7 @@ def health():
 @app.get("/health/llm")
 def health_llm():
     return {
         "ok": bool(gemini_client) and bool(GOOGLE_API_KEYS),
         "gemini": {
             "sdk_import_ok": genai is not None,
@@ -1384,6 +1568,12 @@ def health_llm():
             "num_keys_configured": len(GOOGLE_API_KEYS),
             "current_key_index": current_key_index + 1 if GOOGLE_API_KEYS else 0,
             "rate_limited_keys": list(rate_limited_keys),
             "client_ready": gemini_client is not None,
             "model": GEMINI_MODEL,
             "last_error": GEMINI_LAST_ERROR if GEMINI_LAST_ERROR else None,
@@ -1391,6 +1581,7 @@ def health_llm():
     }
 @app.get("/homework/annotated-url/{homework_id}/{student_id}")
 async def get_annotated_pdf_url(
     homework_id: int,
@@ -1803,10 +1994,13 @@ def build_per_question_results(
     return ai_evaluate_per_question(prompt, student_text, student_level)
 @app.post("/homework/validate")
 async def homework_validate(
     student_id: int = Form(...),
     homework_id: int = Form(...),
     student_file: UploadFile = File(...),
 ):
     # 0) Fetch ERP record -> get all fields automatically
@@ -1836,12 +2030,25 @@ async def homework_validate(
     if final_question_type not in ("mcq", "narrative", "mixed"):
         final_question_type = infer_question_type_from_prompt(prompt, student_text)
     # 1) Infer question_type from prompt automatically (NO EXTRA FIELD)
     # Try to parse mixed questions first
     parsed_questions = parse_questions_from_prompt(prompt)
     has_mcq = any(q.get('type') == 'mcq' for q in parsed_questions)
     has_narrative = any(q.get('type') == 'narrative' for q in parsed_questions)
     # Check if it's a PDF
     is_pdf_submission = student_info.get("kind") == "pdf"
@@ -1885,11 +2092,30 @@ async def homework_validate(
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, unreadable_result, 0, "Unreadable", student_level
             )
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "sub_institute_id": sub_institute_id,
             "syear": syear,
             "question_type": final_question_type,
             "student_level": student_level,
             "status": "Unreadable",
@@ -1900,10 +2126,21 @@ async def homework_validate(
             "llm_used": False,
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
     if student_info.get("needs_ocr") and not student_text:
         # Save annotated PDF even for unreadable (with status shown)
         if is_pdf_submission and original_file_bytes:
             # Show circle mark for scanned PDF that needs OCR
@@ -1911,11 +2148,14 @@ async def homework_validate(
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, ocr_result, 0, "Unreadable", student_level
             )
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "sub_institute_id": sub_institute_id,
             "syear": syear,
             "question_type": final_question_type,
             "student_level": student_level,
             "status": "Unreadable",
@@ -1931,10 +2171,28 @@ async def homework_validate(
     if final_question_type == "mixed":
         # Process each question type separately and combine results
         mcq_results = []
         narrative_results = []
         # Extract ALL MCQ answers from student text with question numbers
         student_answers_by_qid = extract_mcq_answers_with_qid(student_text)
@@ -1951,11 +2209,21 @@ async def homework_validate(
                 if not chosen:
                     chosen = extract_mcq_choice(student_text)
                 correct = q.get('correct_answer') or extract_correct_mcq_from_prompt(q.get('question', ''))
                 if correct and chosen:
                     is_correct = (chosen.lower().strip() == correct.lower().strip())
                     mcq_results.append({
                         'qid': qid,
                         'correct': is_correct,
                         'chosen': chosen,
@@ -1970,6 +2238,12 @@ async def homework_validate(
                         'chosen': '',
                         'correct_answer': correct,
                         'unattempted': True
                     })
         # For narrative questions, use AI to generate reference
@@ -2026,6 +2300,7 @@ async def homework_validate(
                 except Exception as e:
                     narrative_results = {'error': str(e)}
         # Calculate combined score with level-based partial credit for MCQ
         total_mcq = len(mcq_results)
         correct_mcq = sum(1 for r in mcq_results if r.get('correct'))
@@ -2037,6 +2312,12 @@ async def homework_validate(
         # Calculate MCQ score based on level (not just binary correct/incorrect)
         mcq_score = (correct_mcq * credit_per_q) / max(1, total_mcq)
         narrative_score = narrative_results.get('match_percentage', 0) if narrative_results else 0
@@ -2058,12 +2339,15 @@ async def homework_validate(
         else:
             status = "Needs Review"
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes and mcq_results:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, mcq_results, final_score, status, student_level
             )
         return {
             "student_id": student_id,
             "homework_id": homework_id,
@@ -2073,12 +2357,18 @@ async def homework_validate(
             "student_level": student_level,
             "status": status,
             "match_percentage": final_score,
             "submission_remarks": None,
             "rule_based_remark": f"MCQ: {correct_mcq}/{total_mcq} correct. Narrative score: {narrative_score}%. (Level: {student_level}, Credit per Q: {credit_per_q}%)",
             "llm_used": bool(narrative_results and 'error' not in narrative_results),
             "student_extracted_text": student_text,
             "mcq_results": mcq_results,
             "narrative_results": narrative_results,
             "question_marks": make_question_marks(mcq_results),
             "annotated_pdf": annotated_pdf_filename,
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
@@ -2235,6 +2525,14 @@ async def homework_validate(
                 annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                     original_file_bytes, homework_id, student_id, no_correct_result, 0, "Needs Review", student_level
                 )
             return {
                 "student_id": student_id,
                 "homework_id": homework_id,
@@ -2244,6 +2542,7 @@ async def homework_validate(
                 "student_level": student_level,
                 "status": "Needs Review",
                 "match_percentage": 0,
                 "submission_remarks": None,
                 "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
                 "student_extracted_text": student_text,
@@ -2260,6 +2559,17 @@ async def homework_validate(
                 annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                     original_file_bytes, homework_id, student_id, no_chosen_result, 0, "Needs Review", student_level
                 )
             return {
                 "student_id": student_id,
                 "homework_id": homework_id,
@@ -2269,16 +2579,24 @@ async def homework_validate(
                 "student_level": student_level,
                 "status": "Needs Review",
                 "match_percentage": 0,
                 "submission_remarks": None,
                 "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
                 "student_extracted_text": student_text,
                 "llm_used": False,
                 "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
                 "debug": {"correct": correct, "chosen": chosen},
                 "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
             }
         # Only process MCQ validation if not redirecting to narrative
         if not redirect_to_narrative:
             is_correct = (chosen == correct)
@@ -2328,6 +2646,30 @@ async def homework_validate(
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
         return {
             "student_id": student_id,
             "homework_id": homework_id,
@@ -2337,13 +2679,20 @@ async def homework_validate(
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
             "submission_remarks": None,
             "rule_based_remark": "Gemini not configured. Check /health/llm.",
             "llm_used": False,
             "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
             "student_extracted_text": student_text,
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
@@ -2364,11 +2713,14 @@ async def homework_validate(
     )
     if not response_text:
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
         return {
             "student_id": student_id,
             "homework_id": homework_id,
@@ -2378,13 +2730,20 @@ async def homework_validate(
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
             "submission_remarks": None,
             "rule_based_remark": "Gemini failed. Check /health/llm.",
             "llm_used": False,
             "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
             "student_extracted_text": student_text,
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
@@ -2392,11 +2751,14 @@ async def homework_validate(
         m = re.search(r"\{.*\}", response_text, flags=re.S)
         payload = json.loads(m.group(0) if m else response_text)
     except Exception as e:
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
         return {
             "student_id": student_id,
             "homework_id": homework_id,
@@ -2406,13 +2768,20 @@ async def homework_validate(
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
             "submission_remarks": None,
             "rule_based_remark": "Gemini returned non-JSON output.",
             "llm_used": False,
             "llm_error": {"ok": False, "error_type": "GEMINI_BAD_JSON", "message": str(e), "raw": response_text[:800]},
             "student_extracted_text": student_text,
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
@@ -2423,11 +2792,14 @@ async def homework_validate(
     key_points = [str(x).strip() for x in key_points if str(x).strip()]
     if not ai_reference_answer:
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
         return {
             "student_id": student_id,
             "homework_id": homework_id,
@@ -2437,12 +2809,19 @@ async def homework_validate(
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
             "submission_remarks": None,
             "rule_based_remark": "AI returned empty reference answer.",
             "llm_used": True,
             "student_extracted_text": student_text,
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
@@ -2473,7 +2852,11 @@ async def homework_validate(
         f"{remark_prompt}"
     )
     submission_remark = generate_gemini_response(
         prompt=resp2_prompt,
         system_prompt="You are a strict, helpful teacher. Be concise and factual.",
         max_tokens=140,
@@ -2481,10 +2864,17 @@ async def homework_validate(
     )
     rule_based_remark = None
     remark_llm_used = bool(submission_remark)
     remark_llm_error = None if submission_remark else (GEMINI_LAST_ERROR or "Unknown LLM error")
     if not submission_remark:
         if status == "Verified":
             rule_based_remark = "Homework matches the expected answer well. Good coverage of the key ideas."
         elif status == "Partial":
@@ -2492,6 +2882,7 @@ async def homework_validate(
         else:
             rule_based_remark = "Homework does not match the expected answer enough. Please review the topic and resubmit with clearer, complete points."
     # Save annotated PDF — evaluate EACH question individually against student text
     per_question_results = build_per_question_results(
         prompt, student_text, status, match_pct,
@@ -2505,6 +2896,8 @@ async def homework_validate(
             original_file_bytes, homework_id, student_id, per_question_results, match_pct, status, student_level, "narrative"
         )
     return {
         "student_id": student_id,
         "homework_id": homework_id,
@@ -2514,7 +2907,11 @@ async def homework_validate(
         "student_level": student_level,
         "status": status,
         "match_percentage": match_pct,
         "submission_remarks": submission_remark if submission_remark else None,
         "rule_based_remark": rule_based_remark,
         "llm_used": True,
         "remark_llm_used": remark_llm_used,
@@ -2524,15 +2921,21 @@ async def homework_validate(
         "key_points": key_points,
         "key_points_covered": covered,
         "key_points_missing": missing,
         "question_marks": make_question_marks(per_question_results),
         "annotated_pdf": annotated_pdf_filename,
         "debug": {
             "similarity": sim,
             "coverage": coverage,
             "policy": policy,
             "per_question_results": per_question_results,
             "erp_row_fields": list(erp_row.keys()) if erp_row else [],
             "erp_student_level_raw": erp_row.get("student_level") or erp_row.get("level") or erp_row.get("difficulty") or erp_row.get("difficulty_level"),
         },
         "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
     }

 from fastapi.middleware.cors import CORSMiddleware
 from PIL import Image, ImageOps, ImageFilter
 import pytesseract
+<<<<<<< HEAD
 import os
 # Serve static files from outputs directory
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 from dotenv import load_dotenv
 load_dotenv()
     PdfReader = None
 try:
+<<<<<<< HEAD
     from reportlab.pdfgen import canvas
     from reportlab.lib.pagesizes import letter
     from reportlab.lib import colors
     print(f"[WARN] reportlab import failed: {e}")
 try:
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     from pdf2image import convert_from_bytes  # requires poppler
 except Exception:
     convert_from_bytes = None
     genai = None
     print(f"[WARN] google-genai import failed: {e}")
+<<<<<<< HEAD
 # ✅ Google Cloud Vision SDK (for better handwritten OCR)
 try:
     from google.cloud import vision
         "num_keys": len(GOOGLE_API_KEYS),
         "has_openai_key": bool(os.getenv("OPENAI_API_KEY")),
     }
+=======
+# =========================================================
+# ✅ FASTAPI APP INSTANCE
+# =========================================================
+app = FastAPI()
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+<<<<<<< HEAD
+=======
+# =========================================================
+# ✅ TESSERACT PATH
+# =========================================================
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 # Tell pytesseract where the Tesseract executable lives: the fixed Windows
 # install path when os.name is "nt", otherwise /usr/bin/tesseract.
 if os.name == "nt":
     pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
 else:
     pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+<<<<<<< HEAD
+=======
+# =========================================================
+# ✅ ERP CONFIG
+# =========================================================
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 # ERP settings, each read from an environment variable of the same name.
 # ERP_BASE and STORAGE_BASE fall back to the URLs below when unset;
 # ERP_TOKEN falls back to an empty string.
 ERP_BASE = os.getenv("ERP_BASE", "https://erp.triz.co.in/lms_data")
 STORAGE_BASE = os.getenv("STORAGE_BASE", "https://erp.triz.co.in/storage/student/")
 ERP_TOKEN = os.getenv("ERP_TOKEN", "")
+<<<<<<< HEAD
 def get_public_base_url() -> str:
     """
     Returns the public base URL of this server.
         return
     api_key = GOOGLE_API_KEYS[key_index]
+=======
+# =========================================================
+# ✅ GEMINI CONFIG
+# =========================================================
+GOOGLE_API_KEY = (os.getenv("GOOGLE_API_KEY") or "").strip()
+GEMINI_MODEL = (os.getenv("GEMINI_MODEL", "models/gemini-2.0-flash") or "").strip()
+if GEMINI_MODEL and not GEMINI_MODEL.startswith("models/"):
+    GEMINI_MODEL = "models/" + GEMINI_MODEL
+gemini_client = None
+GEMINI_LAST_ERROR = ""
+def _init_gemini_client() -> None:
+    global gemini_client, GEMINI_LAST_ERROR
+    if gemini_client is not None:
+        return
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     if not genai:
         GEMINI_LAST_ERROR = "google-genai not installed / import failed"
         gemini_client = None
         return
+<<<<<<< HEAD
     if not api_key:
         GEMINI_LAST_ERROR = f"GOOGLE_API_KEY_{key_index + 1} not set"
+=======
+    if not GOOGLE_API_KEY:
+        GEMINI_LAST_ERROR = "GOOGLE_API_KEY not set"
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         gemini_client = None
         return
     try:
+<<<<<<< HEAD
         gemini_client = genai.Client(api_key=api_key)
         GEMINI_LAST_ERROR = ""
         print(f"[INFO] Gemini client initialized with key #{key_index + 1}")
+=======
+        gemini_client = genai.Client(api_key=GOOGLE_API_KEY)
+        GEMINI_LAST_ERROR = ""
+        print("[INFO] Gemini client initialized")
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     except Exception as e:
         gemini_client = None
         GEMINI_LAST_ERROR = str(e)
         print(f"[WARN] Gemini init failed: {GEMINI_LAST_ERROR}")
+<<<<<<< HEAD
 def _is_rate_limit_error(error_msg: str) -> bool:
     """Check if the error is a rate limit error (429) or service unavailable (503)."""
     if not error_msg:
 _init_gemini_client(0)
+=======
+_init_gemini_client()
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 def parse_gemini_error(error_msg: str) -> dict:
     return {"ok": False, "error_type": "GEMINI_ERROR", "message": msg}
+<<<<<<< HEAD
 def extract_qid_from_prompt(prompt: str, erp_row: dict = None) -> str:
     """
     return "Q1"
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 def generate_gemini_response(
     prompt: str,
     system_prompt: str = "",
     max_tokens: int = 650,
     temperature: float = 0.3,
 ) -> str:
+<<<<<<< HEAD
     global GEMINI_LAST_ERROR, gemini_client, rate_limited_keys
+=======
+    global GEMINI_LAST_ERROR
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     if gemini_client is None:
         if not GEMINI_LAST_ERROR:
             GEMINI_LAST_ERROR = "Gemini client not initialized"
+<<<<<<< HEAD
         # Try to reinitialize if we have keys available
         if GOOGLE_API_KEYS and current_key_index not in rate_limited_keys:
             _init_gemini_client(current_key_index)
         if gemini_client is None:
             return ""
+=======
+        return ""
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     try:
         contents = []
             GEMINI_LAST_ERROR = ""
         return text
     except Exception as e:
+<<<<<<< HEAD
         error_msg = str(e)
         print(f"[ERROR] Gemini call failed: {error_msg}")
                 return generate_gemini_response(prompt, system_prompt, max_tokens, temperature)
         GEMINI_LAST_ERROR = error_msg
+=======
+        GEMINI_LAST_ERROR = str(e)
+        print(f"[ERROR] Gemini call failed: {GEMINI_LAST_ERROR}")
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return ""
 import time
     return int(round(min(0.6, overlap) * 100))  # cap at 60
+<<<<<<< HEAD
+=======
+# =========================================================
+# ✅ SMALL UTILS
+# =========================================================
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 def _norm(s: str) -> str:
     return re.sub(r"\s+", " ", (s or "").strip().lower())
     return {"w_sim": 0.6, "w_cov": 0.4, "verified": 75, "partial": 55, "kp_thr": 0.20}
+<<<<<<< HEAD
 def mcq_partial_credit(student_level: str) -> dict:
     """
     Returns partial credit percentage for MCQ questions based on student level.
     return {"credit_per_question": 75, "passing_threshold": 75}
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 def keypoint_coverage(student_text: str, key_points: List[str], kp_threshold: float) -> Tuple[List[str], List[str], float]:
     covered, missing = [], []
     for kp in key_points:
     return covered, missing, coverage
+<<<<<<< HEAD
 def infer_question_type_from_prompt(prompt: str, student_text: str = "") -> str:
+=======
+# =========================================================
+# ✅ QUESTION TYPE INFERENCE + MCQ PARSING
+# =========================================================
+def infer_question_type_from_prompt(prompt: str) -> str:
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     p = _norm(prompt)
     # Explicit markers - check for (mcq) first since it's common in parentheses
     if re.search(r"\btype\s*:\s*narrative\b", p) or re.search(r"\bquestion_type\s*:\s*narrative\b", p):
         return "narrative"
+<<<<<<< HEAD
     # Heuristic: options A/B/C/D exist in prompt -> likely MCQ
     if re.search(r"\b(a|b|c|d)\s*[\)\.]\s+", p) or "option a" in p or "option b" in p:
         return "mcq"
         # If answer starts with A. or B. etc.
         if re.search(r"^[a-d]\.\s+", s.strip()):
             return "mcq"
+=======
+    # Heuristic: options A/B/C/D exist -> likely MCQ
+    if re.search(r"\b(a|b|c|d)\s*[\)\.]\s+", p) or "option a" in p or "option b" in p:
+        return "mcq"
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     return "narrative"
                 # Check for correct answer (for MCQ)
                 if current_type == 'mcq':
+<<<<<<< HEAD
                     # First check: is this line "Correct Answer(s):" with nothing after it?
                     # If so, we need to look for the answer on the next line
                     if re.search(r'^correct\s*answer\s*\(?s\)?\s*[:\.]?\s*$', line, re.IGNORECASE):
                         else:
                             # Try to extract first letter
                             current_correct = correct_text[0].upper() if correct_text else None
+=======
+                    # Look for "Correct Answer(s):" or "Correct:" or "Answer:"
+                    correct_match = re.search(r'(?:Correct\s*(?:Answer)?|Answer)[:.]\s*(?:[A-D]\.?\s*)?(.+)', line, re.IGNORECASE)
+                    if correct_match and not current_correct:
+                        current_correct = correct_match.group(1).strip()
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     # Don't forget the last question
     if current_q is not None:
     # If no questions parsed, fall back to old behavior
     if not questions:
         qtype = infer_question_type_from_prompt(prompt)
+<<<<<<< HEAD
         return [{'qid': extract_qid_from_prompt(prompt), 'type': qtype, 'question': prompt, 'correct_answer': None}]
+=======
+        return [{'qid': 'Q1', 'type': qtype, 'question': prompt, 'correct_answer': None}]
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     return questions
     return ""
+<<<<<<< HEAD
 def extract_mcq_answers_with_qid(text: str) -> Dict[str, str]:
     """
     Extract MCQ answers WITH question numbers from student text.
     return results
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 def extract_correct_mcq_from_prompt(prompt: str) -> str:
     """
     This is IMPORTANT:
       - Correct: B
       - Answer: C
       - correct_option: D
+<<<<<<< HEAD
       - Correct Answer(s): A. Devdatta
     or JSON: {"correct_option":"B"}
       - "Correct Answer(s): A. Devdatta"
       - "Correct: B"
       - "Answer: C"
+=======
+    or JSON: {"correct_option":"B"}
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     """
     p = (prompt or "").strip()
     if not p:
         except Exception:
             pass
+<<<<<<< HEAD
     # Text prompt support - new format: "Correct Answer(s): A. Devdatta" or "Correct Answer: B"
     t = _norm(p)
         return m1c.group(1)
     # Pattern 2: "Correct: A" or "Answer: B" (original pattern)
+=======
+    # Text prompt support
+    t = _norm(p)
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     m = re.search(r"\b(correct|answer|ans)\s*[:\-]?\s*\(?\s*([a-d])\s*\)?\b", t)
     if m:
         return m.group(2)
     return ""
+<<<<<<< HEAD
+=======
+# =========================================================
+# ✅ ERP HELPERS
+# =========================================================
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 def _erp_get(params: dict) -> list:
     headers = {}
     if ERP_TOKEN:
     return "Medium"
+<<<<<<< HEAD
 def _preprocess_for_ocr(img: Image.Image) -> Image.Image:
     """
         return ""
+=======
+# =========================================================
+# ✅ OCR + TEXT EXTRACTION
+# =========================================================
+def _preprocess_for_ocr(img: Image.Image) -> Image.Image:
+    """
+    Prepare an image for OCR: grayscale, autocontrast, upscale if small,
+    sharpen, then threshold to pure black/white.
+
+    Returns the processed image.
+    """
+    # Convert to mode "L" (single-channel grayscale).
+    img = img.convert("L")
+    img = ImageOps.autocontrast(img)
+    w, h = img.size
+    # Upscale so the longer side becomes 1600 px when it is smaller than that.
+    if max(w, h) < 1600:
+        scale = 1600 / max(w, h)
+        img = img.resize((int(w * scale), int(h * scale)))
+    img = img.filter(ImageFilter.SHARPEN)
+    # Binarize: pixel values above 170 become 255, everything else becomes 0.
+    img = img.point(lambda p: 255 if p > 170 else 0)
+    return img
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 def extract_text_from_image(image_bytes: bytes, filename: str = "unknown") -> str:
     if not image_bytes or len(image_bytes) < 50:
         raise HTTPException(status_code=400, detail=f"Invalid file: '{filename}' - empty/too small")
         head = image_bytes[:12]
         raise HTTPException(status_code=400, detail=f"Invalid image format: '{filename}' (header={head})")
+<<<<<<< HEAD
     # First try Google Cloud Vision (better for handwriting)
     if vision_client:
         gv_text = _extract_text_google_vision(image_bytes)
             return _clean_extracted_text(gv_text)
     # Fallback to Tesseract with improved preprocessing
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     try:
         img = Image.open(io.BytesIO(image_bytes))
     except Exception as e:
     img = _preprocess_for_ocr(img)
+<<<<<<< HEAD
     # Try multiple OCR configurations for better handwritten recognition
     ocr_configs = [
         "--oem 3 --psm 6",  # Default
             raise HTTPException(status_code=500, detail=f"OCR failed: {e}")
     text = (best_text or "").strip()
+=======
+    try:
+        text = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6")
+    except pytesseract.TesseractNotFoundError:
+        raise HTTPException(status_code=500, detail="Tesseract OCR not found. Install it / fix path.")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"OCR failed: {e}")
+    text = (text or "").strip()
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     text = re.sub(r"[ \t]+", " ", text)
     return text
             return {"text": extracted, "used_ocr": False, "needs_ocr": True}
         try:
             used_ocr = True
+<<<<<<< HEAD
             # Higher DPI for better handwritten OCR
             pages = convert_from_bytes(pdf_bytes, dpi=300)
             page_texts = []
                 if img:
                     img = _preprocess_for_ocr(img)
                     extracted = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6") or ""
+=======
+            pages = convert_from_bytes(pdf_bytes, dpi=250)
+            page_texts = []
+            for img in pages:
+                img = _preprocess_for_ocr(img)
+                t = pytesseract.image_to_string(img, lang="eng", config="--oem 3 --psm 6") or ""
+                if t.strip():
+                    page_texts.append(t)
+            extracted = _clean_extracted_text("\n\n".join(page_texts))
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         except Exception as e:
             return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": True, "ocr_error": str(e)}
     return {"text": extracted, "used_ocr": used_ocr, "needs_ocr": False}
+<<<<<<< HEAD
 def get_question_positions_from_pdf(pdf_bytes: bytes) -> Dict[int, List[Dict]]:
     """
     Detect question number positions in a PDF.
         print(f"[ERROR] Failed to create annotated PDF: {e}")
         return original_pdf_bytes
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 async def extract_text_from_upload(file: UploadFile) -> Dict[str, Any]:
     filename = getattr(file, "filename", "") or "upload"
     content_type = (getattr(file, "content_type", "") or "").lower()
+<<<<<<< HEAD
+=======
+# =========================================================
+# ✅ ROUTES
+# =========================================================
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 @app.get("/health")
 def health():
     """Health endpoint: always responds with a fixed "ok" status payload."""
     payload = {"status": "ok"}
     return payload
 @app.get("/health/llm")
 def health_llm():
     return {
+<<<<<<< HEAD
         "ok": bool(gemini_client) and bool(GOOGLE_API_KEYS),
         "gemini": {
             "sdk_import_ok": genai is not None,
             "num_keys_configured": len(GOOGLE_API_KEYS),
             "current_key_index": current_key_index + 1 if GOOGLE_API_KEYS else 0,
             "rate_limited_keys": list(rate_limited_keys),
+=======
+        "ok": bool(gemini_client) and bool(GOOGLE_API_KEY),
+        "gemini": {
+            "sdk_import_ok": genai is not None,
+            "configured": bool(GOOGLE_API_KEY),
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "client_ready": gemini_client is not None,
             "model": GEMINI_MODEL,
             "last_error": GEMINI_LAST_ERROR if GEMINI_LAST_ERROR else None,
     }
+<<<<<<< HEAD
 @app.get("/homework/annotated-url/{homework_id}/{student_id}")
 async def get_annotated_pdf_url(
     homework_id: int,
     return ai_evaluate_per_question(prompt, student_text, student_level)
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
 @app.post("/homework/validate")
 async def homework_validate(
     student_id: int = Form(...),
     homework_id: int = Form(...),
+<<<<<<< HEAD
     student_file: UploadFile = File(...),
 ):
     # 0) Fetch ERP record -> get all fields automatically
     if final_question_type not in ("mcq", "narrative", "mixed"):
         final_question_type = infer_question_type_from_prompt(prompt, student_text)
+=======
+    sub_institute_id: int = Form(...),
+    syear: str = Form(...),
+    prompt: str = Form(...),
+    student_file: UploadFile = File(...),
+):
+    # 0) Fetch ERP record -> get student_level automatically
+    erp_row = fetch_student_record(homework_id, student_id)
+    student_level = fetch_student_level_from_erp(erp_row)
+    policy = level_policy(student_level)
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     # 1) Infer question_type from prompt automatically (NO EXTRA FIELD)
     # Try to parse mixed questions first
     parsed_questions = parse_questions_from_prompt(prompt)
     has_mcq = any(q.get('type') == 'mcq' for q in parsed_questions)
     has_narrative = any(q.get('type') == 'narrative' for q in parsed_questions)
+<<<<<<< HEAD
     # Check if it's a PDF
     is_pdf_submission = student_info.get("kind") == "pdf"
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, unreadable_result, 0, "Unreadable", student_level
             )
+=======
+    # Determine overall question type for backwards compatibility
+    if has_mcq and has_narrative:
+        question_type = "mixed"
+    elif has_mcq:
+        question_type = "mcq"
+    elif has_narrative:
+        question_type = "narrative"
+    else:
+        question_type = infer_question_type_from_prompt(prompt)
+    # 2) Extract student text
+    student_info = await extract_text_from_upload(student_file)
+    student_text = (student_info.get("text") or "").strip()
+    MIN_WORDS = 3 if question_type == "mcq" else 8
+    if len(student_text.split()) < MIN_WORDS:
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "sub_institute_id": sub_institute_id,
             "syear": syear,
+<<<<<<< HEAD
             "question_type": final_question_type,
             "student_level": student_level,
             "status": "Unreadable",
             "llm_used": False,
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
+=======
+            "question_type": question_type,
+            "student_level": student_level,
+            "status": "Unreadable",
+            "match_percentage": 0,
+            "ai_generated_remark": None,
+            "rule_based_remark": "Answer text could not be read clearly. Please upload a clearer file.",
+            "student_extracted_text": student_text,
+            "llm_used": False,
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
     if student_info.get("needs_ocr") and not student_text:
+<<<<<<< HEAD
         # Save annotated PDF even for unreadable (with status shown)
         if is_pdf_submission and original_file_bytes:
             # Show circle mark for scanned PDF that needs OCR
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, ocr_result, 0, "Unreadable", student_level
             )
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "sub_institute_id": sub_institute_id,
             "syear": syear,
+<<<<<<< HEAD
             "question_type": final_question_type,
             "student_level": student_level,
             "status": "Unreadable",
     if final_question_type == "mixed":
+=======
+            "question_type": question_type,
+            "student_level": student_level,
+            "status": "Unreadable",
+            "match_percentage": 0,
+            "ai_generated_remark": None,
+            "rule_based_remark": "This PDF looks scanned. OCR is required (install pdf2image + poppler) or upload a clearer file.",
+            "student_extracted_text": student_text,
+            "llm_used": False,
+            "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
+        }
+    # =========================================================
+    # ✅ MIXED QUESTION TYPES CHECK (MCQ + Narrative)
+    # =========================================================
+    if question_type == "mixed":
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         # Process each question type separately and combine results
         mcq_results = []
         narrative_results = []
+<<<<<<< HEAD
         # Extract ALL MCQ answers from student text with question numbers
         student_answers_by_qid = extract_mcq_answers_with_qid(student_text)
                 if not chosen:
                     chosen = extract_mcq_choice(student_text)
+=======
+        # Extract MCQ answers from student text for each MCQ question
+        for q in parsed_questions:
+            if q.get('type') == 'mcq':
+                # Try to find answer for this specific question in student's text
+                # Use the question text to help locate the answer
+                q_text = q.get('question', '')
+                chosen = extract_mcq_choice(student_text)
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
                 correct = q.get('correct_answer') or extract_correct_mcq_from_prompt(q.get('question', ''))
                 if correct and chosen:
                     is_correct = (chosen.lower().strip() == correct.lower().strip())
                     mcq_results.append({
+<<<<<<< HEAD
                         'qid': qid,
                         'correct': is_correct,
                         'chosen': chosen,
                         'chosen': '',
                         'correct_answer': correct,
                         'unattempted': True
+=======
+                        'qid': q.get('qid'),
+                        'correct': is_correct,
+                        'chosen': chosen,
+                        'correct_answer': correct
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
                     })
         # For narrative questions, use AI to generate reference
                 except Exception as e:
                     narrative_results = {'error': str(e)}
+<<<<<<< HEAD
         # Calculate combined score with level-based partial credit for MCQ
         total_mcq = len(mcq_results)
         correct_mcq = sum(1 for r in mcq_results if r.get('correct'))
         # Calculate MCQ score based on level (not just binary correct/incorrect)
         mcq_score = (correct_mcq * credit_per_q) / max(1, total_mcq)
+=======
+        # Calculate combined score
+        total_mcq = len(mcq_results)
+        correct_mcq = sum(1 for r in mcq_results if r.get('correct'))
+        mcq_score = (correct_mcq / total_mcq * 100) if total_mcq > 0 else 0
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         narrative_score = narrative_results.get('match_percentage', 0) if narrative_results else 0
         else:
             status = "Needs Review"
+<<<<<<< HEAD
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes and mcq_results:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, mcq_results, final_score, status, student_level
             )
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "student_level": student_level,
             "status": status,
             "match_percentage": final_score,
+<<<<<<< HEAD
             "submission_remarks": None,
             "rule_based_remark": f"MCQ: {correct_mcq}/{total_mcq} correct. Narrative score: {narrative_score}%. (Level: {student_level}, Credit per Q: {credit_per_q}%)",
+=======
+            "ai_generated_remark": None,
+            "rule_based_remark": f"MCQ: {correct_mcq}/{total_mcq} correct. Narrative score: {narrative_score}%.",
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "llm_used": bool(narrative_results and 'error' not in narrative_results),
             "student_extracted_text": student_text,
             "mcq_results": mcq_results,
             "narrative_results": narrative_results,
+<<<<<<< HEAD
             "question_marks": make_question_marks(mcq_results),
             "annotated_pdf": annotated_pdf_filename,
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
                 annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                     original_file_bytes, homework_id, student_id, no_correct_result, 0, "Needs Review", student_level
                 )
+=======
+            "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
+        }
+        correct = extract_correct_mcq_from_prompt(prompt)
+        chosen = extract_mcq_choice(student_text)
+        if not correct:
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             return {
                 "student_id": student_id,
                 "homework_id": homework_id,
                 "student_level": student_level,
                 "status": "Needs Review",
                 "match_percentage": 0,
+<<<<<<< HEAD
                 "submission_remarks": None,
                 "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
                 "student_extracted_text": student_text,
                 annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                     original_file_bytes, homework_id, student_id, no_chosen_result, 0, "Needs Review", student_level
                 )
+=======
+                "ai_generated_remark": None,
+                "rule_based_remark": "MCQ correct option not found in prompt. Include 'Correct: B' or similar in prompt.",
+                "student_extracted_text": student_text,
+                "llm_used": False,
+                "debug": {"correct": correct, "chosen": chosen},
+                "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
+            }
+        if not chosen:
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             return {
                 "student_id": student_id,
                 "homework_id": homework_id,
                 "student_level": student_level,
                 "status": "Needs Review",
                 "match_percentage": 0,
+<<<<<<< HEAD
                 "submission_remarks": None,
                 "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
                 "student_extracted_text": student_text,
                 "llm_used": False,
                 "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
+=======
+                "ai_generated_remark": None,
+                "rule_based_remark": "Student option (A/B/C/D) not detected clearly.",
+                "student_extracted_text": student_text,
+                "llm_used": False,
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
                 "debug": {"correct": correct, "chosen": chosen},
                 "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
             }
+<<<<<<< HEAD
         # Only process MCQ validation if not redirecting to narrative
         if not redirect_to_narrative:
             is_correct = (chosen == correct)
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
+=======
+        is_correct = (chosen == correct)
+        return {
+            "student_id": student_id,
+            "homework_id": homework_id,
+            "sub_institute_id": sub_institute_id,
+            "syear": syear,
+            "question_type": "mcq",
+            "student_level": student_level,
+            "status": "Verified" if is_correct else "Needs Review",
+            "match_percentage": 100 if is_correct else 0,
+            "ai_generated_remark": None,
+            "rule_based_remark": "Correct." if is_correct else f"Incorrect. Expected {correct.upper()}, got {chosen.upper()}.",
+            "student_extracted_text": student_text,
+            "llm_used": False,
+            "debug": {"correct": correct, "chosen": chosen},
+            "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
+        }
+    # =========================================================
+    # ✅ NARRATIVE CHECK (Gemini generates reference)
+    # =========================================================
+    if gemini_client is None:
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
+<<<<<<< HEAD
             "submission_remarks": None,
+=======
+            "ai_generated_remark": None,
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "rule_based_remark": "Gemini not configured. Check /health/llm.",
             "llm_used": False,
             "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
             "student_extracted_text": student_text,
+<<<<<<< HEAD
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
     )
     if not response_text:
+<<<<<<< HEAD
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
+<<<<<<< HEAD
             "submission_remarks": None,
+=======
+            "ai_generated_remark": None,
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "rule_based_remark": "Gemini failed. Check /health/llm.",
             "llm_used": False,
             "llm_error": parse_gemini_error(GEMINI_LAST_ERROR),
             "student_extracted_text": student_text,
+<<<<<<< HEAD
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
         m = re.search(r"\{.*\}", response_text, flags=re.S)
         payload = json.loads(m.group(0) if m else response_text)
     except Exception as e:
+<<<<<<< HEAD
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
+<<<<<<< HEAD
             "submission_remarks": None,
+=======
+            "ai_generated_remark": None,
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "rule_based_remark": "Gemini returned non-JSON output.",
             "llm_used": False,
             "llm_error": {"ok": False, "error_type": "GEMINI_BAD_JSON", "message": str(e), "raw": response_text[:800]},
             "student_extracted_text": student_text,
+<<<<<<< HEAD
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
     key_points = [str(x).strip() for x in key_points if str(x).strip()]
     if not ai_reference_answer:
+<<<<<<< HEAD
         # Save annotated PDF
         if is_pdf_submission and original_file_bytes:
             annotated_pdf_filename, annotated_pdf_url = save_annotated_pdf(
                 original_file_bytes, homework_id, student_id, [], 0, "Needs Review", student_level
             )
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         return {
             "student_id": student_id,
             "homework_id": homework_id,
             "student_level": student_level,
             "status": "Needs Review",
             "match_percentage": 0,
+<<<<<<< HEAD
             "submission_remarks": None,
             "rule_based_remark": "AI returned empty reference answer.",
             "llm_used": True,
             "student_extracted_text": student_text,
             "question_marks": make_question_marks([]),
             "annotated_pdf": annotated_pdf_filename,
+=======
+            "ai_generated_remark": None,
+            "rule_based_remark": "AI returned empty reference answer.",
+            "llm_used": True,
+            "student_extracted_text": student_text,
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
             "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
         }
         f"{remark_prompt}"
     )
+<<<<<<< HEAD
     submission_remark = generate_gemini_response(
+=======
+    ai_generated_remark = generate_gemini_response(
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         prompt=resp2_prompt,
         system_prompt="You are a strict, helpful teacher. Be concise and factual.",
         max_tokens=140,
     )
     rule_based_remark = None
+<<<<<<< HEAD
     remark_llm_used = bool(submission_remark)
     remark_llm_error = None if submission_remark else (GEMINI_LAST_ERROR or "Unknown LLM error")
     if not submission_remark:
+=======
+    remark_llm_used = bool(ai_generated_remark)
+    remark_llm_error = None if ai_generated_remark else (GEMINI_LAST_ERROR or "Unknown LLM error")
+    if not ai_generated_remark:
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         if status == "Verified":
             rule_based_remark = "Homework matches the expected answer well. Good coverage of the key ideas."
         elif status == "Partial":
         else:
             rule_based_remark = "Homework does not match the expected answer enough. Please review the topic and resubmit with clearer, complete points."
+<<<<<<< HEAD
     # Save annotated PDF — evaluate EACH question individually against student text
     per_question_results = build_per_question_results(
         prompt, student_text, status, match_pct,
             original_file_bytes, homework_id, student_id, per_question_results, match_pct, status, student_level, "narrative"
         )
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
     return {
         "student_id": student_id,
         "homework_id": homework_id,
         "student_level": student_level,
         "status": status,
         "match_percentage": match_pct,
+<<<<<<< HEAD
         "submission_remarks": submission_remark if submission_remark else None,
+=======
+        "ai_generated_remark": ai_generated_remark if ai_generated_remark else None,
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         "rule_based_remark": rule_based_remark,
         "llm_used": True,
         "remark_llm_used": remark_llm_used,
         "key_points": key_points,
         "key_points_covered": covered,
         "key_points_missing": missing,
+<<<<<<< HEAD
         "question_marks": make_question_marks(per_question_results),
         "annotated_pdf": annotated_pdf_filename,
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         "debug": {
             "similarity": sim,
             "coverage": coverage,
             "policy": policy,
+<<<<<<< HEAD
             "per_question_results": per_question_results,
             "erp_row_fields": list(erp_row.keys()) if erp_row else [],
             "erp_student_level_raw": erp_row.get("student_level") or erp_row.get("level") or erp_row.get("difficulty") or erp_row.get("difficulty_level"),
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e
         },
         "extraction": {"student": {k: v for k, v in student_info.items() if k != "text"}},
     }

db.py CHANGED Viewed

@@ -10,4 +10,7 @@ engine = create_engine(
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 Base = declarative_base()

 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 Base = declarative_base()
+<<<<<<< HEAD
+=======
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e

requirements.txt CHANGED Viewed

@@ -5,6 +5,7 @@ pillow
 sqlalchemy
 scikit-learn
 requests
 python-docx
 google-genai
 python-dotenv
@@ -15,3 +16,12 @@ google-cloud-vision
 easyocr
 pdf2image
 reportlab

 sqlalchemy
 scikit-learn
 requests
+<<<<<<< HEAD
 python-docx
 google-genai
 python-dotenv
 easyocr
 pdf2image
 reportlab
+=======
+python-multipart
+openai
+google-generativeai
+python-docx
+pypdf
+pdf2image
+python-dotenv
+>>>>>>> cdb5b148e5facdea1aec264a5b4d0b6293132b6e

scholar clone.lnk ADDED Viewed

Binary file (760 Bytes). View file