import os
import re
from typing import Dict, List, Optional

import pandas as pd

# Patterns are compiled once at import time because they are applied to every
# line of every question segment.

# Zero-width split point in front of each line that opens a numbered question
# ("1.", "12)", "Q3." ...).
_SEGMENT_SPLIT_RE = re.compile(r"(?m)(?=^\s*(?:Q\d+|\d{1,3})[\)\.])")
# Question header line; group 1 is the question text after the number.
_QUESTION_RE = re.compile(r"^(?:Q\d+|\d{1,3})[\)\.]\s*(.*)")
# Option line ("A.", "b)" ...); group 2 is the option text.
_OPTION_RE = re.compile(r"^([A-Da-d])[\)\.]\s*(.*)")
# Answer line. The look-ahead requires the letter to stand alone so that
# "Answer: Delhi" is not misread as answer "D"; "." is accepted as a separator
# so that the common "Ans. B" form is recognised.
_ANSWER_RE = re.compile(
    r"^(?:Answer|Ans|Correct answer)[\s:\-\.]*\(?([A-Da-d])(?![A-Za-z0-9])\)?",
    re.IGNORECASE,
)
# Explicit explanation line; group 1 is the explanation text.
_EXPLANATION_RE = re.compile(
    r"^(?:Explanation|Why|Because)[\s:\-]*(.*)", re.IGNORECASE
)
# Lines that look like the start of a question or option are never treated
# as explanation continuation.
_NEW_ITEM_RE = re.compile(r"^(?:Q\d+|[A-Da-d][\)\.]|\d{1,3}[\)\.])")


def _read_document_text(filepath: str, ext: str) -> str:
    """Return the plain text of a PDF or DOCX file.

    Raises:
        ValueError: if *ext* is neither ``'pdf'`` nor ``'docx'``.
    """
    if ext == 'pdf':
        # Imported lazily so the dependency is only needed for PDF input.
        import pdfplumber

        with pdfplumber.open(filepath) as pdf:
            # extract_text() may return None for a page; fall back to ''.
            pages = [page.extract_text() or '' for page in pdf.pages]
        return '\n'.join(pages)
    if ext == 'docx':
        # Imported lazily so the dependency is only needed for DOCX input.
        from docx import Document

        doc = Document(filepath)
        return '\n'.join(p.text for p in doc.paragraphs)
    raise ValueError('Unsupported file format')


def _parse_segment(lines: List[str]) -> Optional[Dict]:
    """Parse the stripped, non-empty lines of one question segment.

    Returns the MCQ dict, or ``None`` when the segment does not contain a
    question with at least two options.
    """
    question = ''
    options: List[str] = []
    answer = ''
    explanation = ''
    # True once an explicit explanation line was seen, even if its own text
    # was empty, so that following lines are kept as continuation.
    in_explanation = False

    for ln in lines:
        q_match = _QUESTION_RE.match(ln)
        if q_match and not question:
            question = q_match.group(1).strip()
            continue
        # Until the first option shows up, every other line extends the
        # question text.
        if not options and not answer and not _OPTION_RE.match(ln):
            question += ' ' + ln
            continue

        opt_match = _OPTION_RE.match(ln)
        if opt_match:
            options.append(opt_match.group(2).strip())
            continue

        ans_match = _ANSWER_RE.match(ln)
        if ans_match:
            answer = ans_match.group(1).upper()
            continue

        exp_match = _EXPLANATION_RE.match(ln)
        if exp_match:
            explanation = exp_match.group(1).strip()
            in_explanation = True
            continue

        # Free text after the answer or after an explicit explanation line
        # continues the explanation. Unrecognised lines between the options
        # and the answer are ignored.
        if (answer or in_explanation) and not _NEW_ITEM_RE.match(ln):
            explanation += ' ' + ln

    # Keep only valid MCQs (a question plus at least two options).
    if question and len(options) >= 2:
        return {
            'question': question.strip(),
            'options': options,
            'answer': answer,
            'explanation': explanation.strip(),
        }
    return None


def _parse_mcq_text(raw_text: str) -> List[Dict]:
    """Split *raw_text* into numbered question segments and parse each one."""
    # Normalize newlines
    text = re.sub(r"\r\n?", "\n", raw_text)

    mcqs = []
    for segment in _SEGMENT_SPLIT_RE.split(text):
        lines = [ln.strip() for ln in segment.splitlines() if ln.strip()]
        if not lines:
            continue
        mcq = _parse_segment(lines)
        if mcq is not None:
            mcqs.append(mcq)
    return mcqs


def extract_mcqs_from_file(filepath: str, raw_text: Optional[str] = None) -> List[Dict]:
    """Extract multiple-choice questions from a file or from given text.

    When *raw_text* is non-empty it is parsed directly and *filepath* is not
    read. Otherwise the file type is taken from the extension of *filepath*:

    * ``pdf`` / ``docx``: the text is extracted and parsed.
    * ``xls`` / ``xlsx`` / ``csv``: the rows are returned as-is, one dict per
      row keyed by column name; no MCQ parsing is applied.

    Parsed text yields dicts with the keys ``'question'``, ``'options'``
    (list of option texts), ``'answer'`` (upper-case letter ``A``-``D`` or
    ``''`` when none was found) and ``'explanation'``. Segments without a
    question and at least two options are skipped.

    Raises:
        ValueError: if *raw_text* is empty and the extension is unsupported.
    """
    if not raw_text:
        ext = filepath.rsplit('.', 1)[-1].lower()
        if ext in ('xls', 'xlsx'):
            return pd.read_excel(filepath).to_dict(orient='records')
        if ext == 'csv':
            return pd.read_csv(filepath).to_dict(orient='records')
        raw_text = _read_document_text(filepath, ext)

    return _parse_mcq_text(raw_text)


# ========================
# Split MCQs into chunks
# ========================
def split_large_mcq_list(mcqs: List[Dict], max_per_chunk: int = 500) -> List[List[Dict]]:
    """Split *mcqs* into consecutive chunks of at most *max_per_chunk* items.

    A list that already fits (including an empty list) is returned as a
    single chunk, ``[mcqs]``, without copying.

    Raises:
        ValueError: if *max_per_chunk* is smaller than 1. A negative step
            would otherwise silently produce no chunks at all.
    """
    if max_per_chunk < 1:
        raise ValueError('max_per_chunk must be a positive integer')
    if len(mcqs) <= max_per_chunk:
        return [mcqs]
    return [
        mcqs[start:start + max_per_chunk]
        for start in range(0, len(mcqs), max_per_chunk)
    ]