Spaces:

pavan1221
/

smartsaduvu

Sleeping

App Files Files Community

pavan1221 committed on Mar 14

Commit

524cbae

verified ·

1 Parent(s): 5a011c0

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +121 -68

app/main.py CHANGED Viewed

@@ -12,8 +12,10 @@ import numpy as np
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from reportlab.lib.pagesizes import A4
 from reportlab.lib import colors
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
-from reportlab.lib.styles import getSampleStyleSheet
 import faiss
 from sentence_transformers import SentenceTransformer
@@ -109,7 +111,7 @@ def call_llm(prompt, max_tokens=2000):
 def is_scanned_pdf(pdf_path):
     try:
         doc = fitz.open(pdf_path)
-        text_pages = 0
         check_pages = min(5, len(doc))
         for i in range(check_pages):
             text = doc[i].get_text().strip()
@@ -121,13 +123,6 @@ def is_scanned_pdf(pdf_path):
         return False
 def smart_sample_pages(total_pages):
-    """
-    Smart page sampling:
-    - Small  (<50 pages):  read all
-    - Medium (<200 pages): every 2nd page
-    - Large  (200+ pages): every 5th page
-    Always include first and last 5 pages.
-    """
     if total_pages <= 50:
         return list(range(total_pages))
     elif total_pages <= 200:
@@ -148,7 +143,6 @@ def process_page(args):
         doc.close()
         if len(text) > 50 and not is_scanned:
             return page_num, text
-        # OCR fallback
         doc2  = fitz.open(pdf_path)
         page2 = doc2[page_num]
         pix   = page2.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
@@ -157,7 +151,7 @@ def process_page(args):
         ocr_text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
         doc2.close()
         return page_num, ocr_text
-    except Exception as e:
         return page_num, ""
 def filter_text(text):
@@ -169,20 +163,17 @@ def filter_text(text):
     return text.strip()
 def extract_text_from_pdf(pdf_path, progress_cb=None):
-    doc         = fitz.open(pdf_path)
-    total_pages = len(doc)
     doc.close()
-    scanned    = is_scanned_pdf(pdf_path)
-    pages_list = smart_sample_pages(total_pages)
     total_sampled = len(pages_list)
     print(f"PDF: {total_pages} pages, scanned={scanned}, sampling {total_sampled} pages")
     results = {}
     done    = [0]
     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-        futures = {
-            executor.submit(process_page, (i, pdf_path, scanned)): i
-            for i in pages_list
-        }
         for future in as_completed(futures):
             page_num, text = future.result()
             clean = filter_text(text)
@@ -196,7 +187,7 @@ def extract_text_from_pdf(pdf_path, progress_cb=None):
     print(f"Extracted {len(full_text)} characters from {len(results)} pages")
     return full_text.strip(), scanned
-# ── Smart Chunking with Overlap ────────────────────────────────────────────────
 def split_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
     words  = text.split()
     chunks = []
@@ -344,7 +335,6 @@ CONTENT:
                 q['difficulty'] = difficulty
                 q['topic']      = topic
                 q['id']         = str(uuid.uuid4())[:8]
-                # Normalize correct_answer types
                 if q.get('is_multi') and isinstance(q.get('correct_answer'), str):
                     q['correct_answer'] = [q['correct_answer']]
                 if not q.get('is_multi') and isinstance(q.get('correct_answer'), list):
@@ -387,7 +377,6 @@ def evaluate_answer(question, user_answer, question_type):
                 "options":        question.get("options", {}),
                 "score":          4 if is_correct else -1
             }
     elif question_type == "fill":
         correct_ans = question.get("correct_answer", "")
         is_correct  = user_answer.strip().lower() == correct_ans.strip().lower()
@@ -403,7 +392,6 @@ def evaluate_answer(question, user_answer, question_type):
             "options":        {},
             "score":          4 if is_correct else -1
         }
     elif question_type == "long":
         prompt = f"""You are a strict examiner. Grade this answer.
 Question: {question['question']}
@@ -482,7 +470,6 @@ def process_pdfs_background(job_id, file_paths, file_names, question_type, diffi
         all_text     = ""
         scanned_warn = False
-        # ── Step 1: Extract text from all PDFs in parallel ──
         progress_cb(5, "Extracting text from PDFs...")
         pdf_results = {}
@@ -510,21 +497,17 @@ def process_pdfs_background(job_id, file_paths, file_names, question_type, diffi
         print(f"Total combined text: {len(all_text)} chars")
-        # ── Step 2: Chunking ────────────────────────────────
         progress_cb(38, "Splitting into chunks...")
         all_chunks = split_into_chunks(all_text)
         print(f"Total chunks: {len(all_chunks)}")
-        # ── Step 3: FAISS index ─────────────────────────────
         progress_cb(42, "Building vector index...")
         faiss_index = build_faiss_index(all_chunks, progress_cb)
-        # ── Step 4: Topic detection ─────────────────────────
         progress_cb(67, "Detecting topics...")
         topics = detect_topics(all_text[:6000])
         progress_cb(70, f"Found {len(topics)} topics: {', '.join(topics[:3])}...")
-        # ── Step 5: Generate questions per topic ────────────
         all_questions = []
         qpt       = max(1, count // len(topics))
         remainder = count - (qpt * len(topics))
@@ -550,7 +533,6 @@ def process_pdfs_background(job_id, file_paths, file_names, question_type, diffi
             job_store[job_id]["message"] = "No questions could be generated. Try different settings."
             return
-        # ── Step 6: Create session ──────────────────────────
         progress_cb(97, "Setting up your exam...")
         sid = create_session(all_questions, exam_mode=exam_mode, time_limit=time_limit)
         sessions[sid]["sources"]      = file_names
@@ -673,7 +655,6 @@ async def answer(
     if not question:
         return {"error": "Question not found"}
-    # Prevent double counting if user navigates back
     if question_id in session["answers"]:
         return {"status": "already_answered", **session["answers"][question_id]}
@@ -779,46 +760,118 @@ async def export(session_id: str):
     session = sessions.get(session_id)
     if not session:
         return JSONResponse({"error": "Session not found"})
-    total = len(session["questions"])
-    path  = f"/tmp/results_{session_id}.pdf"
-    doc   = SimpleDocTemplate(path, pagesize=A4)
-    styles = getSampleStyleSheet()
-    story  = []
-    story.append(Paragraph("Exam Results Report", styles["Title"]))
-    story.append(Spacer(1, 12))
-    summary_data = [
-        ["Total Questions", str(total)],
-        ["Attempted",       str(session["total_attempted"])],
-        ["Correct",         str(session["correct"])],
-        ["Wrong",           str(session["wrong"])],
-        ["Skipped",         str(session["skipped"])],
-        ["Final Score",     f"{session['score']} / {total * 4}"],
-        ["Percentage",      f"{round((session['correct']/total)*100,1) if total>0 else 0}%"],
-    ]
-    table = Table(summary_data, colWidths=[200, 200])
-    table.setStyle(TableStyle([
-        ("BACKGROUND",     (0, 0), (-1, 0), colors.HexColor("#2563eb")),
-        ("TEXTCOLOR",      (0, 0), (-1, 0), colors.whitesmoke),
-        ("ALIGN",          (0, 0), (-1, -1), "CENTER"),
-        ("FONTSIZE",       (0, 0), (-1, -1), 12),
-        ("ROWBACKGROUNDS", (0, 0), (-1, -1), [colors.HexColor("#eff6ff"), colors.white]),
-        ("GRID",           (0, 0), (-1, -1), 1, colors.HexColor("#d0ddef"))
-    ]))
-    story.append(table)
-    story.append(Spacer(1, 20))
-    story.append(Paragraph("Question Review", styles["Heading2"]))
-    story.append(Spacer(1, 10))
     for i, item in enumerate(session["history"]):
-        status = "✓ Correct" if item["correct"] else ("⊘ Skipped" if not item.get("user_answer") else "✗ Wrong")
-        story.append(Paragraph(f"Q{i+1}: {item['question']} [{status}]", styles["Normal"]))
-        if item.get("user_answer"):
-            story.append(Paragraph(f"Your Answer: {item['user_answer']}", styles["Normal"]))
-        story.append(Paragraph(f"Correct Answer: {item['correct_answer']}", styles["Normal"]))
         if item.get("explanation"):
-            story.append(Paragraph(f"Explanation: {item['explanation']}", styles["Normal"]))
-        story.append(Spacer(1, 8))
     doc.build(story)
-    return FileResponse(path, filename=f"results_{session_id}.pdf", media_type="application/pdf")
 @app.get("/health")
 async def health():

 from concurrent.futures import ThreadPoolExecutor, as_completed
 from reportlab.lib.pagesizes import A4
 from reportlab.lib import colors
+from reportlab.lib.units import mm
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, HRFlowable
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.enums import TA_LEFT, TA_CENTER
 import faiss
 from sentence_transformers import SentenceTransformer
 def is_scanned_pdf(pdf_path):
     try:
         doc = fitz.open(pdf_path)
+        text_pages  = 0
         check_pages = min(5, len(doc))
         for i in range(check_pages):
             text = doc[i].get_text().strip()
         return False
 def smart_sample_pages(total_pages):
     if total_pages <= 50:
         return list(range(total_pages))
     elif total_pages <= 200:
         doc.close()
         if len(text) > 50 and not is_scanned:
             return page_num, text
         doc2  = fitz.open(pdf_path)
         page2 = doc2[page_num]
         pix   = page2.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
         ocr_text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
         doc2.close()
         return page_num, ocr_text
+    except:
         return page_num, ""
 def filter_text(text):
     return text.strip()
 def extract_text_from_pdf(pdf_path, progress_cb=None):
+    doc           = fitz.open(pdf_path)
+    total_pages   = len(doc)
     doc.close()
+    scanned       = is_scanned_pdf(pdf_path)
+    pages_list    = smart_sample_pages(total_pages)
     total_sampled = len(pages_list)
     print(f"PDF: {total_pages} pages, scanned={scanned}, sampling {total_sampled} pages")
     results = {}
     done    = [0]
     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        futures = {executor.submit(process_page, (i, pdf_path, scanned)): i for i in pages_list}
         for future in as_completed(futures):
             page_num, text = future.result()
             clean = filter_text(text)
     print(f"Extracted {len(full_text)} characters from {len(results)} pages")
     return full_text.strip(), scanned
+# ── Smart Chunking ─────────────────────────────────────────────────────────────
 def split_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
     words  = text.split()
     chunks = []
                 q['difficulty'] = difficulty
                 q['topic']      = topic
                 q['id']         = str(uuid.uuid4())[:8]
                 if q.get('is_multi') and isinstance(q.get('correct_answer'), str):
                     q['correct_answer'] = [q['correct_answer']]
                 if not q.get('is_multi') and isinstance(q.get('correct_answer'), list):
                 "options":        question.get("options", {}),
                 "score":          4 if is_correct else -1
             }
     elif question_type == "fill":
         correct_ans = question.get("correct_answer", "")
         is_correct  = user_answer.strip().lower() == correct_ans.strip().lower()
             "options":        {},
             "score":          4 if is_correct else -1
         }
     elif question_type == "long":
         prompt = f"""You are a strict examiner. Grade this answer.
 Question: {question['question']}
         all_text     = ""
         scanned_warn = False
         progress_cb(5, "Extracting text from PDFs...")
         pdf_results = {}
         print(f"Total combined text: {len(all_text)} chars")
         progress_cb(38, "Splitting into chunks...")
         all_chunks = split_into_chunks(all_text)
         print(f"Total chunks: {len(all_chunks)}")
         progress_cb(42, "Building vector index...")
         faiss_index = build_faiss_index(all_chunks, progress_cb)
         progress_cb(67, "Detecting topics...")
         topics = detect_topics(all_text[:6000])
         progress_cb(70, f"Found {len(topics)} topics: {', '.join(topics[:3])}...")
         all_questions = []
         qpt       = max(1, count // len(topics))
         remainder = count - (qpt * len(topics))
             job_store[job_id]["message"] = "No questions could be generated. Try different settings."
             return
         progress_cb(97, "Setting up your exam...")
         sid = create_session(all_questions, exam_mode=exam_mode, time_limit=time_limit)
         sessions[sid]["sources"]      = file_names
     if not question:
         return {"error": "Question not found"}
     if question_id in session["answers"]:
         return {"status": "already_answered", **session["answers"][question_id]}
     session = sessions.get(session_id)
     if not session:
         return JSONResponse({"error": "Session not found"})
+    path = f"/tmp/results_{session_id}.pdf"
+    doc  = SimpleDocTemplate(
+        path, pagesize=A4,
+        rightMargin=15*mm, leftMargin=15*mm,
+        topMargin=15*mm,   bottomMargin=15*mm
+    )
+    # ── Styles ─────────────────────────────────────────────
+    title_style = ParagraphStyle("title_style", fontName="Helvetica-Bold", fontSize=18, textColor=colors.HexColor("#2563eb"), spaceAfter=4, alignment=TA_CENTER)
+    subtitle_style = ParagraphStyle("subtitle_style", fontName="Helvetica", fontSize=9, textColor=colors.HexColor("#64748b"), spaceAfter=12, alignment=TA_CENTER)
+    q_style = ParagraphStyle("q_style", fontName="Helvetica-Bold", fontSize=10, textColor=colors.HexColor("#0f172a"), spaceAfter=6, leading=15)
+    opt_style = ParagraphStyle("opt_style", fontName="Helvetica", fontSize=9, textColor=colors.HexColor("#334155"), spaceAfter=3, leftIndent=6, leading=13)
+    opt_correct_style = ParagraphStyle("opt_correct_style", fontName="Helvetica-Bold", fontSize=9, textColor=colors.HexColor("#16a34a"), spaceAfter=3, leftIndent=6, leading=13)
+    opt_wrong_style = ParagraphStyle("opt_wrong_style", fontName="Helvetica", fontSize=9, textColor=colors.HexColor("#dc2626"), spaceAfter=3, leftIndent=6, leading=13)
+    expl_label_style = ParagraphStyle("expl_label_style", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#2563eb"), spaceAfter=2, leftIndent=6)
+    expl_style = ParagraphStyle("expl_style", fontName="Helvetica", fontSize=8.5, textColor=colors.HexColor("#334155"), spaceAfter=4, leftIndent=6, leading=13)
+    status_correct = ParagraphStyle("status_correct", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#16a34a"), spaceAfter=5)
+    status_wrong   = ParagraphStyle("status_wrong",   fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#dc2626"), spaceAfter=5)
+    status_skipped = ParagraphStyle("status_skipped", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#d97706"), spaceAfter=5)
+    status_partial = ParagraphStyle("status_partial", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#2563eb"), spaceAfter=5)
+    story = []
+    # ── Title ──────────────────────────────────────────────
+    story.append(Paragraph("Exam Review", title_style))
+    total   = len(session["questions"])
+    correct = session["correct"]
+    wrong   = session["wrong"]
+    skipped = session["skipped"]
+    pct     = round((correct / total) * 100, 1) if total > 0 else 0
+    story.append(Paragraph(
+        f"Score: {correct}/{total} correct  ({pct}%)   |   Wrong: {wrong}   |   Skipped: {skipped}",
+        subtitle_style
+    ))
+    story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor("#dbeafe"), spaceAfter=10))
+    # ── Each Question Card ─────────────────────────────────
     for i, item in enumerate(session["history"]):
+        is_correct = item.get("correct", False)
+        is_partial = item.get("partial", False)
+        is_skipped = not item.get("user_answer") or item.get("timed_out", False)
+        is_wrong   = not is_correct and not is_partial and not is_skipped
+        if is_correct:
+            border_color = colors.HexColor("#16a34a")
+            status_text  = "✓  Correct"
+            s_style      = status_correct
+        elif is_partial:
+            border_color = colors.HexColor("#2563eb")
+            status_text  = "◑  Partial"
+            s_style      = status_partial
+        elif is_skipped:
+            border_color = colors.HexColor("#d97706")
+            status_text  = "○  Skipped"
+            s_style      = status_skipped
+        else:
+            border_color = colors.HexColor("#dc2626")
+            status_text  = "✗  Wrong"
+            s_style      = status_wrong
+        inner = []
+        inner.append(Paragraph(f"Q{i+1}.  {item['question']}", q_style))
+        inner.append(Paragraph(status_text, s_style))
+        # All options
+        options = item.get("options", {})
+        if options:
+            correct_keys = [k.strip().upper() for k in item.get("correct_answer", "").split(",")]
+            user_keys    = [k.strip().upper() for k in item.get("user_answer", "").split(",") if item.get("user_answer")]
+            for key, val in options.items():
+                k              = key.strip().upper()
+                is_correct_opt = k in correct_keys
+                is_user_pick   = k in user_keys
+                if is_correct_opt:
+                    prefix = f"✓  {key}."
+                    style  = opt_correct_style
+                elif is_user_pick and is_wrong:
+                    prefix = f"✗  {key}."
+                    style  = opt_wrong_style
+                else:
+                    prefix = f"     {key}."
+                    style  = opt_style
+                inner.append(Paragraph(f"{prefix}  {val}", style))
+        else:
+            # Fill / Long answer
+            if item.get("user_answer"):
+                inner.append(Paragraph(f"Your Answer:  {item['user_answer']}", opt_wrong_style if is_wrong else opt_correct_style))
+            inner.append(Paragraph(f"Correct Answer:  {item['correct_answer']}", opt_correct_style))
+        # Explanation always shown
         if item.get("explanation"):
+            inner.append(Spacer(1, 4))
+            inner.append(Paragraph("EXPLANATION", expl_label_style))
+            inner.append(Paragraph(item["explanation"], expl_style))
+        # Card with colored left border
+        card_table = Table([[inner]], colWidths=[170*mm])
+        card_table.setStyle(TableStyle([
+            ("BOX",           (0,0), (-1,-1), 0.5, colors.HexColor("#d0ddef")),
+            ("LINEBEFORE",    (0,0), (0,-1),  3,   border_color),
+            ("BACKGROUND",    (0,0), (-1,-1), colors.HexColor("#f7faff")),
+            ("TOPPADDING",    (0,0), (-1,-1), 8),
+            ("BOTTOMPADDING", (0,0), (-1,-1), 8),
+            ("LEFTPADDING",   (0,0), (-1,-1), 10),
+            ("RIGHTPADDING",  (0,0), (-1,-1), 8),
+        ]))
+        story.append(card_table)
+        story.append(Spacer(1, 6))
     doc.build(story)
+    return FileResponse(path, filename=f"exam_review_{session_id}.pdf", media_type="application/pdf")
 @app.get("/health")
 async def health():