pavan1221 committed on
Commit
524cbae
·
verified ·
1 Parent(s): 5a011c0

Update app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +121 -68
app/main.py CHANGED
@@ -12,8 +12,10 @@ import numpy as np
12
  from concurrent.futures import ThreadPoolExecutor, as_completed
13
  from reportlab.lib.pagesizes import A4
14
  from reportlab.lib import colors
15
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
16
- from reportlab.lib.styles import getSampleStyleSheet
 
 
17
  import faiss
18
  from sentence_transformers import SentenceTransformer
19
 
@@ -109,7 +111,7 @@ def call_llm(prompt, max_tokens=2000):
109
  def is_scanned_pdf(pdf_path):
110
  try:
111
  doc = fitz.open(pdf_path)
112
- text_pages = 0
113
  check_pages = min(5, len(doc))
114
  for i in range(check_pages):
115
  text = doc[i].get_text().strip()
@@ -121,13 +123,6 @@ def is_scanned_pdf(pdf_path):
121
  return False
122
 
123
  def smart_sample_pages(total_pages):
124
- """
125
- Smart page sampling:
126
- - Small (<50 pages): read all
127
- - Medium (<200 pages): every 2nd page
128
- - Large (200+ pages): every 5th page
129
- Always include first and last 5 pages.
130
- """
131
  if total_pages <= 50:
132
  return list(range(total_pages))
133
  elif total_pages <= 200:
@@ -148,7 +143,6 @@ def process_page(args):
148
  doc.close()
149
  if len(text) > 50 and not is_scanned:
150
  return page_num, text
151
- # OCR fallback
152
  doc2 = fitz.open(pdf_path)
153
  page2 = doc2[page_num]
154
  pix = page2.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
@@ -157,7 +151,7 @@ def process_page(args):
157
  ocr_text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
158
  doc2.close()
159
  return page_num, ocr_text
160
- except Exception as e:
161
  return page_num, ""
162
 
163
  def filter_text(text):
@@ -169,20 +163,17 @@ def filter_text(text):
169
  return text.strip()
170
 
171
  def extract_text_from_pdf(pdf_path, progress_cb=None):
172
- doc = fitz.open(pdf_path)
173
- total_pages = len(doc)
174
  doc.close()
175
- scanned = is_scanned_pdf(pdf_path)
176
- pages_list = smart_sample_pages(total_pages)
177
  total_sampled = len(pages_list)
178
  print(f"PDF: {total_pages} pages, scanned={scanned}, sampling {total_sampled} pages")
179
  results = {}
180
  done = [0]
181
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
182
- futures = {
183
- executor.submit(process_page, (i, pdf_path, scanned)): i
184
- for i in pages_list
185
- }
186
  for future in as_completed(futures):
187
  page_num, text = future.result()
188
  clean = filter_text(text)
@@ -196,7 +187,7 @@ def extract_text_from_pdf(pdf_path, progress_cb=None):
196
  print(f"Extracted {len(full_text)} characters from {len(results)} pages")
197
  return full_text.strip(), scanned
198
 
199
- # ── Smart Chunking with Overlap ────────────────────────────────────────────────
200
  def split_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
201
  words = text.split()
202
  chunks = []
@@ -344,7 +335,6 @@ CONTENT:
344
  q['difficulty'] = difficulty
345
  q['topic'] = topic
346
  q['id'] = str(uuid.uuid4())[:8]
347
- # Normalize correct_answer types
348
  if q.get('is_multi') and isinstance(q.get('correct_answer'), str):
349
  q['correct_answer'] = [q['correct_answer']]
350
  if not q.get('is_multi') and isinstance(q.get('correct_answer'), list):
@@ -387,7 +377,6 @@ def evaluate_answer(question, user_answer, question_type):
387
  "options": question.get("options", {}),
388
  "score": 4 if is_correct else -1
389
  }
390
-
391
  elif question_type == "fill":
392
  correct_ans = question.get("correct_answer", "")
393
  is_correct = user_answer.strip().lower() == correct_ans.strip().lower()
@@ -403,7 +392,6 @@ def evaluate_answer(question, user_answer, question_type):
403
  "options": {},
404
  "score": 4 if is_correct else -1
405
  }
406
-
407
  elif question_type == "long":
408
  prompt = f"""You are a strict examiner. Grade this answer.
409
  Question: {question['question']}
@@ -482,7 +470,6 @@ def process_pdfs_background(job_id, file_paths, file_names, question_type, diffi
482
  all_text = ""
483
  scanned_warn = False
484
 
485
- # ── Step 1: Extract text from all PDFs in parallel ──
486
  progress_cb(5, "Extracting text from PDFs...")
487
  pdf_results = {}
488
 
@@ -510,21 +497,17 @@ def process_pdfs_background(job_id, file_paths, file_names, question_type, diffi
510
 
511
  print(f"Total combined text: {len(all_text)} chars")
512
 
513
- # ── Step 2: Chunking ────────────────────────────────
514
  progress_cb(38, "Splitting into chunks...")
515
  all_chunks = split_into_chunks(all_text)
516
  print(f"Total chunks: {len(all_chunks)}")
517
 
518
- # ── Step 3: FAISS index ─────────────────────────────
519
  progress_cb(42, "Building vector index...")
520
  faiss_index = build_faiss_index(all_chunks, progress_cb)
521
 
522
- # ── Step 4: Topic detection ─────────────────────────
523
  progress_cb(67, "Detecting topics...")
524
  topics = detect_topics(all_text[:6000])
525
  progress_cb(70, f"Found {len(topics)} topics: {', '.join(topics[:3])}...")
526
 
527
- # ── Step 5: Generate questions per topic ────────────
528
  all_questions = []
529
  qpt = max(1, count // len(topics))
530
  remainder = count - (qpt * len(topics))
@@ -550,7 +533,6 @@ def process_pdfs_background(job_id, file_paths, file_names, question_type, diffi
550
  job_store[job_id]["message"] = "No questions could be generated. Try different settings."
551
  return
552
 
553
- # ── Step 6: Create session ──────────────────────────
554
  progress_cb(97, "Setting up your exam...")
555
  sid = create_session(all_questions, exam_mode=exam_mode, time_limit=time_limit)
556
  sessions[sid]["sources"] = file_names
@@ -673,7 +655,6 @@ async def answer(
673
  if not question:
674
  return {"error": "Question not found"}
675
 
676
- # Prevent double counting if user navigates back
677
  if question_id in session["answers"]:
678
  return {"status": "already_answered", **session["answers"][question_id]}
679
 
@@ -779,46 +760,118 @@ async def export(session_id: str):
779
  session = sessions.get(session_id)
780
  if not session:
781
  return JSONResponse({"error": "Session not found"})
782
- total = len(session["questions"])
783
- path = f"/tmp/results_{session_id}.pdf"
784
- doc = SimpleDocTemplate(path, pagesize=A4)
785
- styles = getSampleStyleSheet()
786
- story = []
787
- story.append(Paragraph("Exam Results Report", styles["Title"]))
788
- story.append(Spacer(1, 12))
789
- summary_data = [
790
- ["Total Questions", str(total)],
791
- ["Attempted", str(session["total_attempted"])],
792
- ["Correct", str(session["correct"])],
793
- ["Wrong", str(session["wrong"])],
794
- ["Skipped", str(session["skipped"])],
795
- ["Final Score", f"{session['score']} / {total * 4}"],
796
- ["Percentage", f"{round((session['correct']/total)*100,1) if total>0 else 0}%"],
797
- ]
798
- table = Table(summary_data, colWidths=[200, 200])
799
- table.setStyle(TableStyle([
800
- ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#2563eb")),
801
- ("TEXTCOLOR", (0, 0), (-1, 0), colors.whitesmoke),
802
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
803
- ("FONTSIZE", (0, 0), (-1, -1), 12),
804
- ("ROWBACKGROUNDS", (0, 0), (-1, -1), [colors.HexColor("#eff6ff"), colors.white]),
805
- ("GRID", (0, 0), (-1, -1), 1, colors.HexColor("#d0ddef"))
806
- ]))
807
- story.append(table)
808
- story.append(Spacer(1, 20))
809
- story.append(Paragraph("Question Review", styles["Heading2"]))
810
- story.append(Spacer(1, 10))
 
 
 
 
 
 
 
 
 
811
  for i, item in enumerate(session["history"]):
812
- status = "βœ“ Correct" if item["correct"] else ("⊘ Skipped" if not item.get("user_answer") else "βœ— Wrong")
813
- story.append(Paragraph(f"Q{i+1}: {item['question']} [{status}]", styles["Normal"]))
814
- if item.get("user_answer"):
815
- story.append(Paragraph(f"Your Answer: {item['user_answer']}", styles["Normal"]))
816
- story.append(Paragraph(f"Correct Answer: {item['correct_answer']}", styles["Normal"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
817
  if item.get("explanation"):
818
- story.append(Paragraph(f"Explanation: {item['explanation']}", styles["Normal"]))
819
- story.append(Spacer(1, 8))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
820
  doc.build(story)
821
- return FileResponse(path, filename=f"results_{session_id}.pdf", media_type="application/pdf")
822
 
823
  @app.get("/health")
824
  async def health():
 
12
  from concurrent.futures import ThreadPoolExecutor, as_completed
13
  from reportlab.lib.pagesizes import A4
14
  from reportlab.lib import colors
15
+ from reportlab.lib.units import mm
16
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, HRFlowable
17
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
18
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER
19
  import faiss
20
  from sentence_transformers import SentenceTransformer
21
 
 
111
  def is_scanned_pdf(pdf_path):
112
  try:
113
  doc = fitz.open(pdf_path)
114
+ text_pages = 0
115
  check_pages = min(5, len(doc))
116
  for i in range(check_pages):
117
  text = doc[i].get_text().strip()
 
123
  return False
124
 
125
  def smart_sample_pages(total_pages):
 
 
 
 
 
 
 
126
  if total_pages <= 50:
127
  return list(range(total_pages))
128
  elif total_pages <= 200:
 
143
  doc.close()
144
  if len(text) > 50 and not is_scanned:
145
  return page_num, text
 
146
  doc2 = fitz.open(pdf_path)
147
  page2 = doc2[page_num]
148
  pix = page2.get_pixmap(matrix=fitz.Matrix(1.5, 1.5))
 
151
  ocr_text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
152
  doc2.close()
153
  return page_num, ocr_text
154
+ except:
155
  return page_num, ""
156
 
157
  def filter_text(text):
 
163
  return text.strip()
164
 
165
  def extract_text_from_pdf(pdf_path, progress_cb=None):
166
+ doc = fitz.open(pdf_path)
167
+ total_pages = len(doc)
168
  doc.close()
169
+ scanned = is_scanned_pdf(pdf_path)
170
+ pages_list = smart_sample_pages(total_pages)
171
  total_sampled = len(pages_list)
172
  print(f"PDF: {total_pages} pages, scanned={scanned}, sampling {total_sampled} pages")
173
  results = {}
174
  done = [0]
175
  with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
176
+ futures = {executor.submit(process_page, (i, pdf_path, scanned)): i for i in pages_list}
 
 
 
177
  for future in as_completed(futures):
178
  page_num, text = future.result()
179
  clean = filter_text(text)
 
187
  print(f"Extracted {len(full_text)} characters from {len(results)} pages")
188
  return full_text.strip(), scanned
189
 
190
+ # ── Smart Chunking ─────────────────────────────────────────────────────────────
191
  def split_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
192
  words = text.split()
193
  chunks = []
 
335
  q['difficulty'] = difficulty
336
  q['topic'] = topic
337
  q['id'] = str(uuid.uuid4())[:8]
 
338
  if q.get('is_multi') and isinstance(q.get('correct_answer'), str):
339
  q['correct_answer'] = [q['correct_answer']]
340
  if not q.get('is_multi') and isinstance(q.get('correct_answer'), list):
 
377
  "options": question.get("options", {}),
378
  "score": 4 if is_correct else -1
379
  }
 
380
  elif question_type == "fill":
381
  correct_ans = question.get("correct_answer", "")
382
  is_correct = user_answer.strip().lower() == correct_ans.strip().lower()
 
392
  "options": {},
393
  "score": 4 if is_correct else -1
394
  }
 
395
  elif question_type == "long":
396
  prompt = f"""You are a strict examiner. Grade this answer.
397
  Question: {question['question']}
 
470
  all_text = ""
471
  scanned_warn = False
472
 
 
473
  progress_cb(5, "Extracting text from PDFs...")
474
  pdf_results = {}
475
 
 
497
 
498
  print(f"Total combined text: {len(all_text)} chars")
499
 
 
500
  progress_cb(38, "Splitting into chunks...")
501
  all_chunks = split_into_chunks(all_text)
502
  print(f"Total chunks: {len(all_chunks)}")
503
 
 
504
  progress_cb(42, "Building vector index...")
505
  faiss_index = build_faiss_index(all_chunks, progress_cb)
506
 
 
507
  progress_cb(67, "Detecting topics...")
508
  topics = detect_topics(all_text[:6000])
509
  progress_cb(70, f"Found {len(topics)} topics: {', '.join(topics[:3])}...")
510
 
 
511
  all_questions = []
512
  qpt = max(1, count // len(topics))
513
  remainder = count - (qpt * len(topics))
 
533
  job_store[job_id]["message"] = "No questions could be generated. Try different settings."
534
  return
535
 
 
536
  progress_cb(97, "Setting up your exam...")
537
  sid = create_session(all_questions, exam_mode=exam_mode, time_limit=time_limit)
538
  sessions[sid]["sources"] = file_names
 
655
  if not question:
656
  return {"error": "Question not found"}
657
 
 
658
  if question_id in session["answers"]:
659
  return {"status": "already_answered", **session["answers"][question_id]}
660
 
 
760
  session = sessions.get(session_id)
761
  if not session:
762
  return JSONResponse({"error": "Session not found"})
763
+
764
+ path = f"/tmp/results_{session_id}.pdf"
765
+ doc = SimpleDocTemplate(
766
+ path, pagesize=A4,
767
+ rightMargin=15*mm, leftMargin=15*mm,
768
+ topMargin=15*mm, bottomMargin=15*mm
769
+ )
770
+
771
+ # ── Styles ─────────────────────────────────────────────
772
+ title_style = ParagraphStyle("title_style", fontName="Helvetica-Bold", fontSize=18, textColor=colors.HexColor("#2563eb"), spaceAfter=4, alignment=TA_CENTER)
773
+ subtitle_style = ParagraphStyle("subtitle_style", fontName="Helvetica", fontSize=9, textColor=colors.HexColor("#64748b"), spaceAfter=12, alignment=TA_CENTER)
774
+ q_style = ParagraphStyle("q_style", fontName="Helvetica-Bold", fontSize=10, textColor=colors.HexColor("#0f172a"), spaceAfter=6, leading=15)
775
+ opt_style = ParagraphStyle("opt_style", fontName="Helvetica", fontSize=9, textColor=colors.HexColor("#334155"), spaceAfter=3, leftIndent=6, leading=13)
776
+ opt_correct_style = ParagraphStyle("opt_correct_style", fontName="Helvetica-Bold", fontSize=9, textColor=colors.HexColor("#16a34a"), spaceAfter=3, leftIndent=6, leading=13)
777
+ opt_wrong_style = ParagraphStyle("opt_wrong_style", fontName="Helvetica", fontSize=9, textColor=colors.HexColor("#dc2626"), spaceAfter=3, leftIndent=6, leading=13)
778
+ expl_label_style = ParagraphStyle("expl_label_style", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#2563eb"), spaceAfter=2, leftIndent=6)
779
+ expl_style = ParagraphStyle("expl_style", fontName="Helvetica", fontSize=8.5, textColor=colors.HexColor("#334155"), spaceAfter=4, leftIndent=6, leading=13)
780
+ status_correct = ParagraphStyle("status_correct", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#16a34a"), spaceAfter=5)
781
+ status_wrong = ParagraphStyle("status_wrong", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#dc2626"), spaceAfter=5)
782
+ status_skipped = ParagraphStyle("status_skipped", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#d97706"), spaceAfter=5)
783
+ status_partial = ParagraphStyle("status_partial", fontName="Helvetica-Bold", fontSize=8, textColor=colors.HexColor("#2563eb"), spaceAfter=5)
784
+
785
+ story = []
786
+
787
+ # ── Title ──────────────────────────────────────────────
788
+ story.append(Paragraph("Exam Review", title_style))
789
+ total = len(session["questions"])
790
+ correct = session["correct"]
791
+ wrong = session["wrong"]
792
+ skipped = session["skipped"]
793
+ pct = round((correct / total) * 100, 1) if total > 0 else 0
794
+ story.append(Paragraph(
795
+ f"Score: {correct}/{total} correct ({pct}%) | Wrong: {wrong} | Skipped: {skipped}",
796
+ subtitle_style
797
+ ))
798
+ story.append(HRFlowable(width="100%", thickness=1, color=colors.HexColor("#dbeafe"), spaceAfter=10))
799
+
800
+ # ── Each Question Card ─────────────────────────────────
801
  for i, item in enumerate(session["history"]):
802
+ is_correct = item.get("correct", False)
803
+ is_partial = item.get("partial", False)
804
+ is_skipped = not item.get("user_answer") or item.get("timed_out", False)
805
+ is_wrong = not is_correct and not is_partial and not is_skipped
806
+
807
+ if is_correct:
808
+ border_color = colors.HexColor("#16a34a")
809
+ status_text = "βœ“ Correct"
810
+ s_style = status_correct
811
+ elif is_partial:
812
+ border_color = colors.HexColor("#2563eb")
813
+ status_text = "β—‘ Partial"
814
+ s_style = status_partial
815
+ elif is_skipped:
816
+ border_color = colors.HexColor("#d97706")
817
+ status_text = "β—‹ Skipped"
818
+ s_style = status_skipped
819
+ else:
820
+ border_color = colors.HexColor("#dc2626")
821
+ status_text = "βœ— Wrong"
822
+ s_style = status_wrong
823
+
824
+ inner = []
825
+ inner.append(Paragraph(f"Q{i+1}. {item['question']}", q_style))
826
+ inner.append(Paragraph(status_text, s_style))
827
+
828
+ # All options
829
+ options = item.get("options", {})
830
+ if options:
831
+ correct_keys = [k.strip().upper() for k in item.get("correct_answer", "").split(",")]
832
+ user_keys = [k.strip().upper() for k in item.get("user_answer", "").split(",") if item.get("user_answer")]
833
+ for key, val in options.items():
834
+ k = key.strip().upper()
835
+ is_correct_opt = k in correct_keys
836
+ is_user_pick = k in user_keys
837
+ if is_correct_opt:
838
+ prefix = f"βœ“ {key}."
839
+ style = opt_correct_style
840
+ elif is_user_pick and is_wrong:
841
+ prefix = f"βœ— {key}."
842
+ style = opt_wrong_style
843
+ else:
844
+ prefix = f" {key}."
845
+ style = opt_style
846
+ inner.append(Paragraph(f"{prefix} {val}", style))
847
+ else:
848
+ # Fill / Long answer
849
+ if item.get("user_answer"):
850
+ inner.append(Paragraph(f"Your Answer: {item['user_answer']}", opt_wrong_style if is_wrong else opt_correct_style))
851
+ inner.append(Paragraph(f"Correct Answer: {item['correct_answer']}", opt_correct_style))
852
+
853
+ # Explanation always shown
854
  if item.get("explanation"):
855
+ inner.append(Spacer(1, 4))
856
+ inner.append(Paragraph("EXPLANATION", expl_label_style))
857
+ inner.append(Paragraph(item["explanation"], expl_style))
858
+
859
+ # Card with colored left border
860
+ card_table = Table([[inner]], colWidths=[170*mm])
861
+ card_table.setStyle(TableStyle([
862
+ ("BOX", (0,0), (-1,-1), 0.5, colors.HexColor("#d0ddef")),
863
+ ("LINEBEFORE", (0,0), (0,-1), 3, border_color),
864
+ ("BACKGROUND", (0,0), (-1,-1), colors.HexColor("#f7faff")),
865
+ ("TOPPADDING", (0,0), (-1,-1), 8),
866
+ ("BOTTOMPADDING", (0,0), (-1,-1), 8),
867
+ ("LEFTPADDING", (0,0), (-1,-1), 10),
868
+ ("RIGHTPADDING", (0,0), (-1,-1), 8),
869
+ ]))
870
+ story.append(card_table)
871
+ story.append(Spacer(1, 6))
872
+
873
  doc.build(story)
874
+ return FileResponse(path, filename=f"exam_review_{session_id}.pdf", media_type="application/pdf")
875
 
876
  @app.get("/health")
877
  async def health():